Line data Source code
1 : /*
2 : * Copyright (C) 2007 Oracle. All rights reserved.
3 : *
4 : * This program is free software; you can redistribute it and/or
5 : * modify it under the terms of the GNU General Public
6 : * License v2 as published by the Free Software Foundation.
7 : *
8 : * This program is distributed in the hope that it will be useful,
9 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 : * General Public License for more details.
12 : *
13 : * You should have received a copy of the GNU General Public
14 : * License along with this program; if not, write to the
15 : * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 : * Boston, MA 02111-1307, USA.
17 : */
18 : #include <linux/sched.h>
19 : #include <linux/bio.h>
20 : #include <linux/slab.h>
21 : #include <linux/buffer_head.h>
22 : #include <linux/blkdev.h>
23 : #include <linux/random.h>
24 : #include <linux/iocontext.h>
25 : #include <linux/capability.h>
26 : #include <linux/ratelimit.h>
27 : #include <linux/kthread.h>
28 : #include <linux/raid/pq.h>
29 : #include <linux/semaphore.h>
30 : #include <asm/div64.h>
31 : #include "ctree.h"
32 : #include "extent_map.h"
33 : #include "disk-io.h"
34 : #include "transaction.h"
35 : #include "print-tree.h"
36 : #include "volumes.h"
37 : #include "raid56.h"
38 : #include "async-thread.h"
39 : #include "check-integrity.h"
40 : #include "rcu-string.h"
41 : #include "math.h"
42 : #include "dev-replace.h"
43 : #include "sysfs.h"
44 :
45 : static int init_first_rw_device(struct btrfs_trans_handle *trans,
46 : struct btrfs_root *root,
47 : struct btrfs_device *device);
48 : static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
49 : static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
50 : static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
51 : static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
52 :
53 : static DEFINE_MUTEX(uuid_mutex);
54 : static LIST_HEAD(fs_uuids);
55 :
56 : static void lock_chunks(struct btrfs_root *root)
57 : {
58 293 : mutex_lock(&root->fs_info->chunk_mutex);
59 : }
60 :
61 : static void unlock_chunks(struct btrfs_root *root)
62 : {
63 293 : mutex_unlock(&root->fs_info->chunk_mutex);
64 : }
65 :
66 106 : static struct btrfs_fs_devices *__alloc_fs_devices(void)
67 : {
68 : struct btrfs_fs_devices *fs_devs;
69 :
70 106 : fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
71 106 : if (!fs_devs)
72 : return ERR_PTR(-ENOMEM);
73 :
74 106 : mutex_init(&fs_devs->device_list_mutex);
75 :
76 106 : INIT_LIST_HEAD(&fs_devs->devices);
77 106 : INIT_LIST_HEAD(&fs_devs->alloc_list);
78 106 : INIT_LIST_HEAD(&fs_devs->list);
79 :
80 106 : return fs_devs;
81 : }
82 :
83 : /**
84 : * alloc_fs_devices - allocate struct btrfs_fs_devices
85 : * @fsid: a pointer to the UUID for this FS. If NULL, a new UUID is
86 : * generated.
87 : *
88 : * Return: a pointer to a new &struct btrfs_fs_devices on success;
89 : * ERR_PTR() on error. Returned struct is not linked onto any lists and
90 : * can be destroyed with kfree() right away.
91 : */
92 106 : static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
93 : {
94 : struct btrfs_fs_devices *fs_devs;
95 :
96 106 : fs_devs = __alloc_fs_devices();
97 106 : if (IS_ERR(fs_devs))
98 : return fs_devs;
99 :
100 106 : if (fsid)
101 106 : memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
102 : else
103 0 : generate_random_uuid(fs_devs->fsid);
104 :
105 : return fs_devs;
106 : }
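
As the kernel-doc above notes, alloc_fs_devices() follows the kernel's
ERR_PTR() convention rather than returning NULL on failure. A minimal
caller sketch (hypothetical helper name, for illustration only):

	/* Hypothetical caller, showing the ERR_PTR()/IS_ERR() contract. */
	static int example_track_fsid(const u8 *fsid)
	{
		struct btrfs_fs_devices *fs_devs;

		fs_devs = alloc_fs_devices(fsid); /* NULL fsid => random UUID */
		if (IS_ERR(fs_devs))
			return PTR_ERR(fs_devs);  /* currently only -ENOMEM */

		/* Not linked anywhere yet; kfree() alone would undo this. */
		list_add(&fs_devs->list, &fs_uuids);
		return 0;
	}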
107 :
108 0 : static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
109 : {
110 : struct btrfs_device *device;
111 0 : WARN_ON(fs_devices->opened);
112 0 : while (!list_empty(&fs_devices->devices)) {
113 : device = list_entry(fs_devices->devices.next,
114 : struct btrfs_device, dev_list);
115 0 : list_del(&device->dev_list);
116 0 : rcu_string_free(device->name);
117 0 : kfree(device);
118 : }
119 0 : kfree(fs_devices);
120 0 : }
121 :
122 0 : static void btrfs_kobject_uevent(struct block_device *bdev,
123 : enum kobject_action action)
124 : {
125 : int ret;
126 :
127 0 : ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
128 0 : if (ret)
129 0 : pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
130 : action,
131 : kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
132 : &disk_to_dev(bdev->bd_disk)->kobj);
133 0 : }
134 :
135 0 : void btrfs_cleanup_fs_uuids(void)
136 : {
137 : struct btrfs_fs_devices *fs_devices;
138 :
139 0 : while (!list_empty(&fs_uuids)) {
140 0 : fs_devices = list_entry(fs_uuids.next,
141 : struct btrfs_fs_devices, list);
142 0 : list_del(&fs_devices->list);
143 0 : free_fs_devices(fs_devices);
144 : }
145 0 : }
146 :
147 382 : static struct btrfs_device *__alloc_device(void)
148 : {
149 : struct btrfs_device *dev;
150 :
151 382 : dev = kzalloc(sizeof(*dev), GFP_NOFS);
152 382 : if (!dev)
153 : return ERR_PTR(-ENOMEM);
154 :
155 382 : INIT_LIST_HEAD(&dev->dev_list);
156 382 : INIT_LIST_HEAD(&dev->dev_alloc_list);
157 :
158 382 : spin_lock_init(&dev->io_lock);
159 :
160 382 : spin_lock_init(&dev->reada_lock);
161 : atomic_set(&dev->reada_in_flight, 0);
162 382 : INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
163 382 : INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
164 :
165 382 : return dev;
166 : }
167 :
168 2480 : static noinline struct btrfs_device *__find_device(struct list_head *head,
169 : u64 devid, u8 *uuid)
170 : {
171 : struct btrfs_device *dev;
172 :
173 2994 : list_for_each_entry(dev, head, dev_list) {
174 2950 : if (dev->devid == devid &&
175 2359 : (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
176 : return dev;
177 : }
178 : }
179 : return NULL;
180 : }
181 :
182 613 : static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
183 : {
184 : struct btrfs_fs_devices *fs_devices;
185 :
186 53577 : list_for_each_entry(fs_devices, &fs_uuids, list) {
187 53471 : if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
188 : return fs_devices;
189 : }
190 : return NULL;
191 : }
192 :
193 : static int
194 256 : btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
195 : int flush, struct block_device **bdev,
196 : struct buffer_head **bh)
197 : {
198 : int ret;
199 :
200 256 : *bdev = blkdev_get_by_path(device_path, flags, holder);
201 :
202 256 : if (IS_ERR(*bdev)) {
203 0 : ret = PTR_ERR(*bdev);
204 0 : printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
205 0 : goto error;
206 : }
207 :
208 256 : if (flush)
209 248 : filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
210 256 : ret = set_blocksize(*bdev, 4096);
211 256 : if (ret) {
212 0 : blkdev_put(*bdev, flags);
213 0 : goto error;
214 : }
215 256 : invalidate_bdev(*bdev);
216 256 : *bh = btrfs_read_dev_super(*bdev);
217 256 : if (!*bh) {
218 : ret = -EINVAL;
219 0 : blkdev_put(*bdev, flags);
220 0 : goto error;
221 : }
222 :
223 : return 0;
224 :
225 : error:
226 0 : *bdev = NULL;
227 0 : *bh = NULL;
228 0 : return ret;
229 : }
230 :
231 : static void requeue_list(struct btrfs_pending_bios *pending_bios,
232 : struct bio *head, struct bio *tail)
233 : {
234 :
235 : struct bio *old_head;
236 :
237 0 : old_head = pending_bios->head;
238 0 : pending_bios->head = head;
239 0 : if (pending_bios->tail)
240 0 : tail->bi_next = old_head;
241 : else
242 0 : pending_bios->tail = tail;
243 : }
244 :
245 : /*
246 : * we try to collect pending bios for a device so we don't get a large
247 : * number of procs sending bios down to the same device. This greatly
248 : * improves the scheduler's ability to collect and merge the bios.
249 : *
250 : * But, it also turns into a long list of bios to process and that is sure
251 : * to eventually make the worker thread block. The solution here is to
252 : * make some progress and then put this work struct back at the end of
253 : * the list if the block device is congested. This way, multiple devices
254 : * can make progress from a single worker thread.
255 : */
256 7328 : static noinline void run_scheduled_bios(struct btrfs_device *device)
257 : {
258 : struct bio *pending;
259 : struct backing_dev_info *bdi;
260 : struct btrfs_fs_info *fs_info;
261 : struct btrfs_pending_bios *pending_bios;
262 : struct bio *tail;
263 : struct bio *cur;
264 : int again = 0;
265 : unsigned long num_run;
266 : unsigned long batch_run = 0;
267 : unsigned long limit;
268 : unsigned long last_waited = 0;
269 : int force_reg = 0;
270 : int sync_pending = 0;
271 : struct blk_plug plug;
272 :
273 : /*
274 : * this function runs all the bios we've collected for
275 : * a particular device. We don't want to wander off to
276 : * another device without first sending all of these down.
277 : * So, set up a plug here and finish it off before we return
278 : */
279 7328 : blk_start_plug(&plug);
280 :
281 7327 : bdi = blk_get_backing_dev_info(device->bdev);
282 7328 : fs_info = device->dev_root->fs_info;
283 7328 : limit = btrfs_async_submit_limit(fs_info);
284 7326 : limit = limit * 2 / 3;
285 :
286 : loop:
287 : spin_lock(&device->io_lock);
288 :
289 : loop_lock:
290 : num_run = 0;
291 :
292 : /* take all the bios off the list at once and process them
293 : * later on (without the lock held). But, remember the
294 : * tail and other pointers so the bios can be properly reinserted
295 : * into the list if we hit congestion
296 : */
297 15383 : if (!force_reg && device->pending_sync_bios.head) {
298 552 : pending_bios = &device->pending_sync_bios;
299 552 : force_reg = 1;
300 : } else {
301 14831 : pending_bios = &device->pending_bios;
302 : force_reg = 0;
303 : }
304 :
305 15383 : pending = pending_bios->head;
306 15383 : tail = pending_bios->tail;
307 15383 : WARN_ON(pending && !tail);
308 :
309 : /*
310 : * if pending was null this time around, no bios need processing
311 : * at all and we can stop. Otherwise it'll loop back up again
312 : * and do an additional check so no bios are missed.
313 : *
314 : * device->running_pending is used to synchronize with the
315 : * schedule_bio code.
316 : */
317 30177 : if (device->pending_sync_bios.head == NULL &&
318 14793 : device->pending_bios.head == NULL) {
319 : again = 0;
320 7344 : device->running_pending = 0;
321 : } else {
322 : again = 1;
323 8040 : device->running_pending = 1;
324 : }
325 :
326 15384 : pending_bios->head = NULL;
327 15384 : pending_bios->tail = NULL;
328 :
329 : spin_unlock(&device->io_lock);
330 :
331 121316 : while (pending) {
332 :
333 105934 : rmb();
334 : /* we want to work on both lists, but do more bios on the
335 : * sync list than the regular list
336 : */
337 184469 : if ((num_run > 32 &&
338 157070 : pending_bios != &device->pending_sync_bios &&
339 184471 : device->pending_sync_bios.head) ||
340 74100 : (num_run > 64 && pending_bios == &device->pending_sync_bios &&
341 0 : device->pending_bios.head)) {
342 : spin_lock(&device->io_lock);
343 : requeue_list(pending_bios, pending, tail);
344 : goto loop_lock;
345 : }
346 :
347 : cur = pending;
348 105936 : pending = pending->bi_next;
349 105936 : cur->bi_next = NULL;
350 :
351 243661 : if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
352 : waitqueue_active(&fs_info->async_submit_wait))
353 18 : wake_up(&fs_info->async_submit_wait);
354 :
355 105948 : BUG_ON(atomic_read(&cur->bi_cnt) == 0);
356 :
357 : /*
358 : * if we're doing the sync list, record that our
359 : * plug has some sync requests on it
360 : *
361 : * If we're doing the regular list and there are
362 : * sync requests sitting around, unplug before
363 : * we add more
364 : */
365 105948 : if (pending_bios == &device->pending_sync_bios) {
366 : sync_pending = 1;
367 105320 : } else if (sync_pending) {
368 30 : blk_finish_plug(&plug);
369 30 : blk_start_plug(&plug);
370 : sync_pending = 0;
371 : }
372 :
373 105948 : btrfsic_submit_bio(cur->bi_rw, cur);
374 105926 : num_run++;
375 105926 : batch_run++;
376 105933 : if (need_resched())
377 35 : cond_resched();
378 :
379 : /*
380 : * we made progress; there is more work to do and the bdi
381 : * is now congested. Back off and let other work structs
382 : * run instead
383 : */
384 203871 : if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
385 0 : fs_info->fs_devices->open_devices > 1) {
386 : struct io_context *ioc;
387 :
388 0 : ioc = current->io_context;
389 :
390 : /*
391 : * the main goal here is that we don't want to
392 : * block if we're going to be able to submit
393 : * more requests without blocking.
394 : *
395 : * This code does two great things, it pokes into
396 : * the elevator code from a filesystem _and_
397 : * it makes assumptions about how batching works.
398 : */
399 0 : if (ioc && ioc->nr_batch_requests > 0 &&
400 0 : time_before(jiffies, ioc->last_waited + HZ/50UL) &&
401 0 : (last_waited == 0 ||
402 : ioc->last_waited == last_waited)) {
403 : /*
404 : * we want to go through our batch of
405 : * requests and stop. So, we copy out
406 : * the ioc->last_waited time and test
407 : * against it before looping
408 : */
409 : last_waited = ioc->last_waited;
410 0 : if (need_resched())
411 0 : cond_resched();
412 0 : continue;
413 : }
414 : spin_lock(&device->io_lock);
415 : requeue_list(pending_bios, pending, tail);
416 0 : device->running_pending = 1;
417 :
418 : spin_unlock(&device->io_lock);
419 0 : btrfs_queue_work(fs_info->submit_workers,
420 : &device->work);
421 0 : goto done;
422 : }
423 : /* unplug every 64 requests just for good measure */
424 105939 : if (batch_run % 64 == 0) {
425 1285 : blk_finish_plug(&plug);
426 1285 : blk_start_plug(&plug);
427 : sync_pending = 0;
428 : }
429 : }
430 :
431 15382 : cond_resched();
432 15382 : if (again)
433 : goto loop;
434 :
435 : spin_lock(&device->io_lock);
436 7344 : if (device->pending_bios.head || device->pending_sync_bios.head)
437 : goto loop_lock;
438 : spin_unlock(&device->io_lock);
439 :
440 : done:
441 7328 : blk_finish_plug(&plug);
442 7328 : }
443 :
444 7328 : static void pending_bios_fn(struct btrfs_work *work)
445 : {
446 : struct btrfs_device *device;
447 :
448 7328 : device = container_of(work, struct btrfs_device, work);
449 7328 : run_scheduled_bios(device);
450 7328 : }
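
run_scheduled_bios() above is the consumer half of the per-device bio
queue. A simplified sketch of the producer half, assuming submitters
chain bios onto device->pending_bios (or pending_sync_bios) under
io_lock and kick the per-device worker, much like the schedule_bio()
path elsewhere in this file (the real path also avoids re-queueing the
work item when the worker is already running):

	/* Illustrative producer (hypothetical name, simplified logic). */
	static void example_queue_bio(struct btrfs_device *device,
				      struct bio *bio, int is_sync)
	{
		struct btrfs_pending_bios *pending_bios;

		spin_lock(&device->io_lock);
		pending_bios = is_sync ? &device->pending_sync_bios
				       : &device->pending_bios;
		bio->bi_next = NULL;
		if (pending_bios->tail)
			pending_bios->tail->bi_next = bio;
		pending_bios->tail = bio;
		if (!pending_bios->head)
			pending_bios->head = bio;
		spin_unlock(&device->io_lock);

		/* pending_bios_fn() -> run_scheduled_bios() drains the list */
		btrfs_queue_work(device->dev_root->fs_info->submit_workers,
				 &device->work);
	}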
451 :
452 : /*
453 : * Add new device to list of registered devices
454 : *
455 : * Returns:
456 : * 1 - first time device is seen
457 : * 0 - device already known
458 : * < 0 - error
459 : */
460 613 : static noinline int device_list_add(const char *path,
461 : struct btrfs_super_block *disk_super,
462 : u64 devid, struct btrfs_fs_devices **fs_devices_ret)
463 : {
464 : struct btrfs_device *device;
465 : struct btrfs_fs_devices *fs_devices;
466 : struct rcu_string *name;
467 : int ret = 0;
468 : u64 found_transid = btrfs_super_generation(disk_super);
469 :
470 613 : fs_devices = find_fsid(disk_super->fsid);
471 613 : if (!fs_devices) {
472 106 : fs_devices = alloc_fs_devices(disk_super->fsid);
473 106 : if (IS_ERR(fs_devices))
474 0 : return PTR_ERR(fs_devices);
475 :
476 106 : list_add(&fs_devices->list, &fs_uuids);
477 106 : fs_devices->latest_devid = devid;
478 106 : fs_devices->latest_trans = found_transid;
479 :
480 : device = NULL;
481 : } else {
482 507 : device = __find_device(&fs_devices->devices, devid,
483 507 : disk_super->dev_item.uuid);
484 : }
485 613 : if (!device) {
486 126 : if (fs_devices->opened)
487 : return -EBUSY;
488 :
489 126 : device = btrfs_alloc_device(NULL, &devid,
490 126 : disk_super->dev_item.uuid);
491 126 : if (IS_ERR(device)) {
492 : /* we can safely leave the fs_devices entry around */
493 0 : return PTR_ERR(device);
494 : }
495 :
496 126 : name = rcu_string_strdup(path, GFP_NOFS);
497 126 : if (!name) {
498 0 : kfree(device);
499 0 : return -ENOMEM;
500 : }
501 126 : rcu_assign_pointer(device->name, name);
502 :
503 126 : mutex_lock(&fs_devices->device_list_mutex);
504 126 : list_add_rcu(&device->dev_list, &fs_devices->devices);
505 126 : fs_devices->num_devices++;
506 126 : mutex_unlock(&fs_devices->device_list_mutex);
507 :
508 : ret = 1;
509 126 : device->fs_devices = fs_devices;
510 487 : } else if (!device->name || strcmp(device->name->str, path)) {
511 : /*
512 : * When FS is already mounted.
513 : * 1. If you are here and if the device->name is NULL that
514 : * means this device was missing at time of FS mount.
515 : * 2. If you are here and if the device->name is different
516 : * from 'path' that means either
517 : * a. The same device disappeared and reappeared with
518 : * different name. or
519 : * b. The missing-disk-which-was-replaced, has
520 : * reappeared now.
521 : *
522 : * We must allow 1 and 2a above. But 2b would be spurious
523 : * and unintentional.
524 : *
525 : * Further, in cases 1 and 2a above, the disk at 'path'
526 : * would have missed some transactions while it was away, and
527 : * in case 2a the stale bdev has to be updated as well.
528 : * 2b must not be allowed at any time.
529 : */
530 :
531 : /*
532 : * For now, we do allow update to btrfs_fs_device through the
533 : * btrfs dev scan cli after FS has been mounted. We're still
534 : * tracking a problem where systems fail mount by subvolume id
535 : * when we reject replacement on a mounted FS.
536 : */
537 0 : if (!fs_devices->opened && found_transid < device->generation) {
538 : /*
539 : * That is if the FS is _not_ mounted and if you
540 : * are here, that means there is more than one
541 : * disk with same uuid and devid.We keep the one
542 : * with larger generation number or the last-in if
543 : * generation are equal.
544 : */
545 : return -EEXIST;
546 : }
547 :
548 0 : name = rcu_string_strdup(path, GFP_NOFS);
549 0 : if (!name)
550 : return -ENOMEM;
551 0 : rcu_string_free(device->name);
552 0 : rcu_assign_pointer(device->name, name);
553 0 : if (device->missing) {
554 0 : fs_devices->missing_devices--;
555 0 : device->missing = 0;
556 : }
557 : }
558 :
559 : /*
560 : * Unmount does not free the btrfs_device struct but would zero
561 : * generation along with most of the other members. So just update
562 : * it back. We need it to pick the disk with the largest generation
563 : * (as above).
564 : */
565 613 : if (!fs_devices->opened)
566 498 : device->generation = found_transid;
567 :
568 613 : if (found_transid > fs_devices->latest_trans) {
569 312 : fs_devices->latest_devid = devid;
570 312 : fs_devices->latest_trans = found_transid;
571 : }
572 613 : *fs_devices_ret = fs_devices;
573 :
574 613 : return ret;
575 : }
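
The tri-state return documented above is consumed by
btrfs_scan_one_device() below; a compressed, hypothetical caller showing
the convention:

	/* device_list_add(): 1 = first sighting, 0 = already known,
	 * negative errno on failure. */
	static int example_scan_result(const char *path,
				       struct btrfs_super_block *disk_super,
				       u64 devid,
				       struct btrfs_fs_devices **fsd)
	{
		int ret = device_list_add(path, disk_super, devid, fsd);

		if (ret < 0)
			return ret;	/* e.g. -EBUSY, -EEXIST, -ENOMEM */
		if (ret > 0)
			pr_info("BTRFS: new device %s\n", path);
		return 0;
	}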
576 :
577 0 : static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
578 : {
579 : struct btrfs_fs_devices *fs_devices;
580 : struct btrfs_device *device;
581 : struct btrfs_device *orig_dev;
582 :
583 0 : fs_devices = alloc_fs_devices(orig->fsid);
584 0 : if (IS_ERR(fs_devices))
585 : return fs_devices;
586 :
587 0 : fs_devices->latest_devid = orig->latest_devid;
588 0 : fs_devices->latest_trans = orig->latest_trans;
589 0 : fs_devices->total_devices = orig->total_devices;
590 :
591 : /* We hold the volume lock, so it is safe to get the devices. */
592 0 : list_for_each_entry(orig_dev, &orig->devices, dev_list) {
593 : struct rcu_string *name;
594 :
595 0 : device = btrfs_alloc_device(NULL, &orig_dev->devid,
596 0 : orig_dev->uuid);
597 0 : if (IS_ERR(device))
598 : goto error;
599 :
600 : /*
601 : * This is ok to do without rcu read locked because we hold the
602 : * uuid mutex so nothing we touch in here is going to disappear.
603 : */
604 0 : if (orig_dev->name) {
605 0 : name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
606 0 : if (!name) {
607 0 : kfree(device);
608 0 : goto error;
609 : }
610 0 : rcu_assign_pointer(device->name, name);
611 : }
612 :
613 0 : list_add(&device->dev_list, &fs_devices->devices);
614 0 : device->fs_devices = fs_devices;
615 0 : fs_devices->num_devices++;
616 : }
617 : return fs_devices;
618 : error:
619 0 : free_fs_devices(fs_devices);
620 0 : return ERR_PTR(-ENOMEM);
621 : }
622 :
623 442 : void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
624 : struct btrfs_fs_devices *fs_devices, int step)
625 : {
626 : struct btrfs_device *device, *next;
627 :
628 : struct block_device *latest_bdev = NULL;
629 : u64 latest_devid = 0;
630 : u64 latest_transid = 0;
631 :
632 442 : mutex_lock(&uuid_mutex);
633 : again:
634 : /* This is the initialized path; it is safe to release the devices. */
635 938 : list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
636 496 : if (device->in_fs_metadata) {
637 496 : if (!device->is_tgtdev_for_dev_replace &&
638 54 : (!latest_transid ||
639 54 : device->generation > latest_transid)) {
640 442 : latest_devid = device->devid;
641 442 : latest_transid = device->generation;
642 442 : latest_bdev = device->bdev;
643 : }
644 496 : continue;
645 : }
646 :
647 0 : if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
648 : /*
649 : * In the first step, keep the device which has
650 : * the correct fsid and the devid that is used
651 : * for the dev_replace procedure.
652 : * In the second step, the dev_replace state is
653 : * read from the device tree and it is known
654 : * whether the procedure is really active or
655 : * not, which means whether this device is
656 : * used or whether it should be removed.
657 : */
658 0 : if (step == 0 || device->is_tgtdev_for_dev_replace) {
659 0 : continue;
660 : }
661 : }
662 0 : if (device->bdev) {
663 0 : blkdev_put(device->bdev, device->mode);
664 0 : device->bdev = NULL;
665 0 : fs_devices->open_devices--;
666 : }
667 0 : if (device->writeable) {
668 0 : list_del_init(&device->dev_alloc_list);
669 0 : device->writeable = 0;
670 0 : if (!device->is_tgtdev_for_dev_replace)
671 0 : fs_devices->rw_devices--;
672 : }
673 : list_del_init(&device->dev_list);
674 0 : fs_devices->num_devices--;
675 0 : rcu_string_free(device->name);
676 0 : kfree(device);
677 : }
678 :
679 442 : if (fs_devices->seed) {
680 : fs_devices = fs_devices->seed;
681 : goto again;
682 : }
683 :
684 442 : fs_devices->latest_bdev = latest_bdev;
685 442 : fs_devices->latest_devid = latest_devid;
686 442 : fs_devices->latest_trans = latest_transid;
687 :
688 442 : mutex_unlock(&uuid_mutex);
689 442 : }
690 :
691 256 : static void __free_device(struct work_struct *work)
692 : {
693 : struct btrfs_device *device;
694 :
695 256 : device = container_of(work, struct btrfs_device, rcu_work);
696 :
697 256 : if (device->bdev)
698 256 : blkdev_put(device->bdev, device->mode);
699 :
700 256 : rcu_string_free(device->name);
701 256 : kfree(device);
702 256 : }
703 :
704 256 : static void free_device(struct rcu_head *head)
705 : {
706 : struct btrfs_device *device;
707 :
708 : device = container_of(head, struct btrfs_device, rcu);
709 :
710 512 : INIT_WORK(&device->rcu_work, __free_device);
711 256 : schedule_work(&device->rcu_work);
712 256 : }
713 :
714 223 : static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
715 : {
716 : struct btrfs_device *device;
717 :
718 223 : if (--fs_devices->opened > 0)
719 : return 0;
720 :
721 221 : mutex_lock(&fs_devices->device_list_mutex);
722 469 : list_for_each_entry(device, &fs_devices->devices, dev_list) {
723 : struct btrfs_device *new_device;
724 : struct rcu_string *name;
725 :
726 248 : if (device->bdev)
727 248 : fs_devices->open_devices--;
728 :
729 496 : if (device->writeable &&
730 248 : device->devid != BTRFS_DEV_REPLACE_DEVID) {
731 248 : list_del_init(&device->dev_alloc_list);
732 248 : fs_devices->rw_devices--;
733 : }
734 :
735 248 : if (device->can_discard)
736 0 : fs_devices->num_can_discard--;
737 248 : if (device->missing)
738 0 : fs_devices->missing_devices--;
739 :
740 248 : new_device = btrfs_alloc_device(NULL, &device->devid,
741 248 : device->uuid);
742 248 : BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
743 :
744 : /* Safe because we are under uuid_mutex */
745 248 : if (device->name) {
746 248 : name = rcu_string_strdup(device->name->str, GFP_NOFS);
747 248 : BUG_ON(!name); /* -ENOMEM */
748 248 : rcu_assign_pointer(new_device->name, name);
749 : }
750 :
751 248 : list_replace_rcu(&device->dev_list, &new_device->dev_list);
752 248 : new_device->fs_devices = device->fs_devices;
753 :
754 248 : call_rcu(&device->rcu, free_device);
755 : }
756 221 : mutex_unlock(&fs_devices->device_list_mutex);
757 :
758 221 : WARN_ON(fs_devices->open_devices);
759 221 : WARN_ON(fs_devices->rw_devices);
760 221 : fs_devices->opened = 0;
761 221 : fs_devices->seeding = 0;
762 :
763 221 : return 0;
764 : }
765 :
766 223 : int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
767 : {
768 : struct btrfs_fs_devices *seed_devices = NULL;
769 : int ret;
770 :
771 223 : mutex_lock(&uuid_mutex);
772 223 : ret = __btrfs_close_devices(fs_devices);
773 223 : if (!fs_devices->opened) {
774 221 : seed_devices = fs_devices->seed;
775 221 : fs_devices->seed = NULL;
776 : }
777 223 : mutex_unlock(&uuid_mutex);
778 :
779 446 : while (seed_devices) {
780 : fs_devices = seed_devices;
781 0 : seed_devices = fs_devices->seed;
782 0 : __btrfs_close_devices(fs_devices);
783 0 : free_fs_devices(fs_devices);
784 : }
785 : /*
786 : * Wait for rcu kworkers under __btrfs_close_devices
787 : * to finish all blkdev_puts so the device is really
788 : * free when umount is done.
789 : */
790 223 : rcu_barrier();
791 223 : return ret;
792 : }
793 :
794 221 : static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
795 : fmode_t flags, void *holder)
796 : {
797 : struct request_queue *q;
798 : struct block_device *bdev;
799 221 : struct list_head *head = &fs_devices->devices;
800 : struct btrfs_device *device;
801 : struct block_device *latest_bdev = NULL;
802 : struct buffer_head *bh;
803 : struct btrfs_super_block *disk_super;
804 : u64 latest_devid = 0;
805 : u64 latest_transid = 0;
806 : u64 devid;
807 : int seeding = 1;
808 : int ret = 0;
809 :
810 221 : flags |= FMODE_EXCL;
811 :
812 469 : list_for_each_entry(device, head, dev_list) {
813 248 : if (device->bdev)
814 0 : continue;
815 248 : if (!device->name)
816 0 : continue;
817 :
818 : /* Just open everything we can; ignore failures here */
819 248 : if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
820 : &bdev, &bh))
821 0 : continue;
822 :
823 248 : disk_super = (struct btrfs_super_block *)bh->b_data;
824 : devid = btrfs_stack_device_id(&disk_super->dev_item);
825 248 : if (devid != device->devid)
826 : goto error_brelse;
827 :
828 248 : if (memcmp(device->uuid, disk_super->dev_item.uuid,
829 : BTRFS_UUID_SIZE))
830 : goto error_brelse;
831 :
832 248 : device->generation = btrfs_super_generation(disk_super);
833 248 : if (!latest_transid || device->generation > latest_transid) {
834 : latest_devid = devid;
835 : latest_transid = device->generation;
836 221 : latest_bdev = bdev;
837 : }
838 :
839 248 : if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
840 0 : device->writeable = 0;
841 : } else {
842 744 : device->writeable = !bdev_read_only(bdev);
843 : seeding = 0;
844 : }
845 :
846 248 : q = bdev_get_queue(bdev);
847 248 : if (blk_queue_discard(q)) {
848 0 : device->can_discard = 1;
849 0 : fs_devices->num_can_discard++;
850 : }
851 :
852 248 : device->bdev = bdev;
853 248 : device->in_fs_metadata = 0;
854 248 : device->mode = flags;
855 :
856 248 : if (!blk_queue_nonrot(bdev_get_queue(bdev)))
857 248 : fs_devices->rotating = 1;
858 :
859 248 : fs_devices->open_devices++;
860 496 : if (device->writeable &&
861 248 : device->devid != BTRFS_DEV_REPLACE_DEVID) {
862 248 : fs_devices->rw_devices++;
863 248 : list_add(&device->dev_alloc_list,
864 : &fs_devices->alloc_list);
865 : }
866 248 : brelse(bh);
867 248 : continue;
868 :
869 : error_brelse:
870 : brelse(bh);
871 0 : blkdev_put(bdev, flags);
872 0 : continue;
873 : }
874 221 : if (fs_devices->open_devices == 0) {
875 : ret = -EINVAL;
876 : goto out;
877 : }
878 221 : fs_devices->seeding = seeding;
879 221 : fs_devices->opened = 1;
880 221 : fs_devices->latest_bdev = latest_bdev;
881 221 : fs_devices->latest_devid = latest_devid;
882 221 : fs_devices->latest_trans = latest_transid;
883 221 : fs_devices->total_rw_bytes = 0;
884 : out:
885 221 : return ret;
886 : }
887 :
888 223 : int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
889 : fmode_t flags, void *holder)
890 : {
891 : int ret;
892 :
893 223 : mutex_lock(&uuid_mutex);
894 223 : if (fs_devices->opened) {
895 2 : fs_devices->opened++;
896 : ret = 0;
897 : } else {
898 221 : ret = __btrfs_open_devices(fs_devices, flags, holder);
899 : }
900 223 : mutex_unlock(&uuid_mutex);
901 223 : return ret;
902 : }
903 :
904 : /*
905 : * Look for a btrfs signature on a device. This may be called out of the mount path,
906 : * and we are not allowed to call set_blocksize during the scan. The superblock
907 : * is read via the pagecache.
908 : */
909 617 : int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
910 : struct btrfs_fs_devices **fs_devices_ret)
911 : {
912 : struct btrfs_super_block *disk_super;
913 : struct block_device *bdev;
914 : struct page *page;
915 : void *p;
916 : int ret = -EINVAL;
917 : u64 devid;
918 : u64 transid;
919 : u64 total_devices;
920 : u64 bytenr;
921 : pgoff_t index;
922 :
923 : /*
924 : * we would like to check all the supers, but that would make
925 : * a btrfs mount succeed after a mkfs from a different FS.
926 : * So, we need to add a special mount option to scan for
927 : * later supers, using BTRFS_SUPER_MIRROR_MAX instead
928 : */
929 : bytenr = btrfs_sb_offset(0);
930 617 : flags |= FMODE_EXCL;
931 617 : mutex_lock(&uuid_mutex);
932 :
933 617 : bdev = blkdev_get_by_path(path, flags, holder);
934 :
935 617 : if (IS_ERR(bdev)) {
936 0 : ret = PTR_ERR(bdev);
937 0 : goto error;
938 : }
939 :
940 : /* make sure our super fits in the device */
941 617 : if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
942 : goto error_bdev_put;
943 :
944 : /* make sure our super fits in the page */
945 : if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
946 : goto error_bdev_put;
947 :
948 : /* make sure our super doesn't straddle pages on disk */
949 : index = bytenr >> PAGE_CACHE_SHIFT;
950 : if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
951 : goto error_bdev_put;
952 :
953 : /* pull in the page with our super */
954 617 : page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
955 : index, GFP_NOFS);
956 :
957 617 : if (IS_ERR_OR_NULL(page))
958 : goto error_bdev_put;
959 :
960 : p = kmap(page);
961 :
962 : /* align our pointer to the offset of the super block */
963 : disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
964 :
965 1230 : if (btrfs_super_bytenr(disk_super) != bytenr ||
966 : btrfs_super_magic(disk_super) != BTRFS_MAGIC)
967 : goto error_unmap;
968 :
969 : devid = btrfs_stack_device_id(&disk_super->dev_item);
970 : transid = btrfs_super_generation(disk_super);
971 : total_devices = btrfs_super_num_devices(disk_super);
972 :
973 613 : ret = device_list_add(path, disk_super, devid, fs_devices_ret);
974 613 : if (ret > 0) {
975 126 : if (disk_super->label[0]) {
976 0 : if (disk_super->label[BTRFS_LABEL_SIZE - 1])
977 0 : disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
978 0 : printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
979 : } else {
980 126 : printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
981 : }
982 :
983 126 : printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
984 : ret = 0;
985 : }
986 613 : if (!ret && fs_devices_ret)
987 613 : (*fs_devices_ret)->total_devices = total_devices;
988 :
989 : error_unmap:
990 : kunmap(page);
991 617 : page_cache_release(page);
992 :
993 : error_bdev_put:
994 617 : blkdev_put(bdev, flags);
995 : error:
996 617 : mutex_unlock(&uuid_mutex);
997 617 : return ret;
998 : }
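
The three "make sure" guards above are plain page arithmetic. A
standalone model (user-space C; the 4096-byte page size is an assumption
standing in for PAGE_CACHE_SIZE) showing that the primary super at
64KiB sits entirely inside one page:

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SIZE_ASSUMED 4096ULL /* stand-in for PAGE_CACHE_SIZE */

	int main(void)
	{
		uint64_t bytenr = 64 * 1024;	/* btrfs_sb_offset(0) */
		uint64_t super = 4096;		/* super block size, upper bound */
		uint64_t index = bytenr / PAGE_SIZE_ASSUMED;
		uint64_t last = (bytenr + super - 1) / PAGE_SIZE_ASSUMED;

		/* the scan bails out unless index == last (no straddling) */
		printf("index=%llu straddles=%s\n",
		       (unsigned long long)index,
		       index == last ? "no" : "yes");
		return 0;
	}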
999 :
1000 : /* helper to account the used device space in the range */
1001 48490 : int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
1002 : u64 end, u64 *length)
1003 : {
1004 : struct btrfs_key key;
1005 48490 : struct btrfs_root *root = device->dev_root;
1006 : struct btrfs_dev_extent *dev_extent;
1007 : struct btrfs_path *path;
1008 : u64 extent_end;
1009 : int ret;
1010 : int slot;
1011 48510 : struct extent_buffer *l;
1012 :
1013 48490 : *length = 0;
1014 :
1015 48490 : if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
1016 : return 0;
1017 :
1018 48490 : path = btrfs_alloc_path();
1019 48490 : if (!path)
1020 : return -ENOMEM;
1021 48490 : path->reada = 2;
1022 :
1023 48490 : key.objectid = device->devid;
1024 48490 : key.offset = start;
1025 48490 : key.type = BTRFS_DEV_EXTENT_KEY;
1026 :
1027 48490 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1028 48490 : if (ret < 0)
1029 : goto out;
1030 48490 : if (ret > 0) {
1031 20 : ret = btrfs_previous_item(root, path, key.objectid, key.type);
1032 20 : if (ret < 0)
1033 : goto out;
1034 : }
1035 :
1036 : while (1) {
1037 48510 : l = path->nodes[0];
1038 48510 : slot = path->slots[0];
1039 97020 : if (slot >= btrfs_header_nritems(l)) {
1040 0 : ret = btrfs_next_leaf(root, path);
1041 0 : if (ret == 0)
1042 0 : continue;
1043 0 : if (ret < 0)
1044 : goto out;
1045 :
1046 : break;
1047 : }
1048 48510 : btrfs_item_key_to_cpu(l, &key, slot);
1049 :
1050 48510 : if (key.objectid < device->devid)
1051 : goto next;
1052 :
1053 48490 : if (key.objectid > device->devid)
1054 : break;
1055 :
1056 48490 : if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
1057 : goto next;
1058 :
1059 48490 : dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1060 96980 : extent_end = key.offset + btrfs_dev_extent_length(l,
1061 : dev_extent);
1062 48490 : if (key.offset <= start && extent_end > end) {
1063 48470 : *length = end - start + 1;
1064 48470 : break;
1065 20 : } else if (key.offset <= start && extent_end > start)
1066 0 : *length += extent_end - start;
1067 20 : else if (key.offset > start && extent_end <= end)
1068 0 : *length += extent_end - key.offset;
1069 20 : else if (key.offset > start && key.offset <= end) {
1070 0 : *length += end - key.offset + 1;
1071 0 : break;
1072 20 : } else if (key.offset > end)
1073 : break;
1074 :
1075 : next:
1076 20 : path->slots[0]++;
1077 : }
1078 : ret = 0;
1079 : out:
1080 48490 : btrfs_free_path(path);
1081 48490 : return ret;
1082 : }
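
The while-loop above classifies each dev extent against the inclusive
query range [start, end] into four overlap cases. The same arithmetic as
a standalone, runnable model (user-space C, illustrative only):

	#include <stdint.h>
	#include <stdio.h>

	/* Clip extent [ext_start, ext_end) against the inclusive range
	 * [start, end] and return the overlapping byte count -- the four
	 * cases the kernel loop walks through. */
	static uint64_t clipped_len(uint64_t ext_start, uint64_t ext_end,
				    uint64_t start, uint64_t end)
	{
		if (ext_end <= start || ext_start > end)
			return 0;			/* disjoint */
		if (ext_start <= start && ext_end > end)
			return end - start + 1;		/* covers the range */
		if (ext_start <= start)
			return ext_end - start;		/* overlaps the head */
		if (ext_end > end)
			return end - ext_start + 1;	/* overlaps the tail */
		return ext_end - ext_start;		/* fully inside */
	}

	int main(void)
	{
		/* extent [10, 30) vs. range [20, 39]: 10 bytes overlap */
		printf("%llu\n",
		       (unsigned long long)clipped_len(10, 30, 20, 39));
		return 0;
	}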
1083 :
1084 185 : static int contains_pending_extent(struct btrfs_trans_handle *trans,
1085 : struct btrfs_device *device,
1086 : u64 *start, u64 len)
1087 : {
1088 : struct extent_map *em;
1089 : int ret = 0;
1090 :
1091 191 : list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
1092 : struct map_lookup *map;
1093 : int i;
1094 :
1095 6 : map = (struct map_lookup *)em->bdev;
1096 10 : for (i = 0; i < map->num_stripes; i++) {
1097 10 : if (map->stripes[i].dev != device)
1098 4 : continue;
1099 12 : if (map->stripes[i].physical >= *start + len ||
1100 6 : map->stripes[i].physical + em->orig_block_len <=
1101 : *start)
1102 3 : continue;
1103 3 : *start = map->stripes[i].physical +
1104 : em->orig_block_len;
1105 : ret = 1;
1106 : }
1107 : }
1108 :
1109 185 : return ret;
1110 : }
1111 :
1112 :
1113 : /*
1114 : * find_free_dev_extent - find free space in the specified device
1115 : * @device: the device which we search the free space in
1116 : * @num_bytes: the size of the free space that we need
1117 : * @start: store the start of the free space.
1118 : * @len: the size of the free space that we find, or the size of the max
1119 : * free space if we don't find suitable free space
1120 : *
1121 : * this uses a pretty simple search; the expectation is that it is
1122 : * called very infrequently and that a given device has a small number
1123 : * of extents
1124 : *
1125 : * @start is used to store the start of the free space if we find it. But if
1126 : * we don't find suitable free space, it will be used to store the start position
1127 : * of the max free space.
1128 : *
1129 : * @len is used to store the size of the free space that we find.
1130 : * But if we don't find suitable free space, it is used to store the size of
1131 : * the max free space.
1132 : */
1133 149 : int find_free_dev_extent(struct btrfs_trans_handle *trans,
1134 : struct btrfs_device *device, u64 num_bytes,
1135 : u64 *start, u64 *len)
1136 : {
1137 : struct btrfs_key key;
1138 149 : struct btrfs_root *root = device->dev_root;
1139 : struct btrfs_dev_extent *dev_extent;
1140 : struct btrfs_path *path;
1141 : u64 hole_size;
1142 : u64 max_hole_start;
1143 : u64 max_hole_size;
1144 : u64 extent_end;
1145 : u64 search_start;
1146 149 : u64 search_end = device->total_bytes;
1147 : int ret;
1148 : int slot;
1149 742 : struct extent_buffer *l;
1150 :
1151 : /* FIXME use last free of some kind */
1152 :
1153 : /* we don't want to overwrite the superblock on the drive,
1154 : * so we make sure to start at an offset of at least 1MB
1155 : */
1156 149 : search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
1157 :
1158 149 : path = btrfs_alloc_path();
1159 149 : if (!path)
1160 : return -ENOMEM;
1161 : again:
1162 152 : max_hole_start = search_start;
1163 : max_hole_size = 0;
1164 : hole_size = 0;
1165 :
1166 152 : if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1167 : ret = -ENOSPC;
1168 : goto out;
1169 : }
1170 :
1171 152 : path->reada = 2;
1172 152 : path->search_commit_root = 1;
1173 152 : path->skip_locking = 1;
1174 :
1175 152 : key.objectid = device->devid;
1176 152 : key.offset = search_start;
1177 152 : key.type = BTRFS_DEV_EXTENT_KEY;
1178 :
1179 152 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1180 152 : if (ret < 0)
1181 : goto out;
1182 152 : if (ret > 0) {
1183 80 : ret = btrfs_previous_item(root, path, key.objectid, key.type);
1184 80 : if (ret < 0)
1185 : goto out;
1186 : }
1187 :
1188 : while (1) {
1189 742 : l = path->nodes[0];
1190 742 : slot = path->slots[0];
1191 1484 : if (slot >= btrfs_header_nritems(l)) {
1192 55 : ret = btrfs_next_leaf(root, path);
1193 55 : if (ret == 0)
1194 0 : continue;
1195 55 : if (ret < 0)
1196 : goto out;
1197 :
1198 : break;
1199 : }
1200 687 : btrfs_item_key_to_cpu(l, &key, slot);
1201 :
1202 687 : if (key.objectid < device->devid)
1203 : goto next;
1204 :
1205 643 : if (key.objectid > device->devid)
1206 : break;
1207 :
1208 640 : if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
1209 : goto next;
1210 :
1211 640 : if (key.offset > search_start) {
1212 127 : hole_size = key.offset - search_start;
1213 :
1214 : /*
1215 : * Have to check before we set max_hole_start, otherwise
1216 : * we could end up sending back this offset anyway.
1217 : */
1218 127 : if (contains_pending_extent(trans, device,
1219 : &search_start,
1220 : hole_size))
1221 : hole_size = 0;
1222 :
1223 127 : if (hole_size > max_hole_size) {
1224 125 : max_hole_start = search_start;
1225 : max_hole_size = hole_size;
1226 : }
1227 :
1228 : /*
1229 : * If this free space is greater than what we need,
1230 : * it must be the max free space that we have found
1231 : * until now, so max_hole_start must point to the start
1232 : * of this free space and the length of this free space
1233 : * is stored in max_hole_size. Thus, we return
1234 : * max_hole_start and max_hole_size and go back to the
1235 : * caller.
1236 : */
1237 127 : if (hole_size >= num_bytes) {
1238 : ret = 0;
1239 : goto out;
1240 : }
1241 : }
1242 :
1243 546 : dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1244 1092 : extent_end = key.offset + btrfs_dev_extent_length(l,
1245 : dev_extent);
1246 546 : if (extent_end > search_start)
1247 543 : search_start = extent_end;
1248 : next:
1249 590 : path->slots[0]++;
1250 590 : cond_resched();
1251 : }
1252 :
1253 : /*
1254 : * At this point, search_start should be the end of
1255 : * allocated dev extents, and when shrinking the device,
1256 : * search_end may be smaller than search_start.
1257 : */
1258 58 : if (search_end > search_start)
1259 58 : hole_size = search_end - search_start;
1260 :
1261 58 : if (hole_size > max_hole_size) {
1262 : max_hole_start = search_start;
1263 : max_hole_size = hole_size;
1264 : }
1265 :
1266 58 : if (contains_pending_extent(trans, device, &search_start, hole_size)) {
1267 3 : btrfs_release_path(path);
1268 3 : goto again;
1269 : }
1270 :
1271 : /* See above. */
1272 55 : if (hole_size < num_bytes)
1273 : ret = -ENOSPC;
1274 : else
1275 : ret = 0;
1276 :
1277 : out:
1278 149 : btrfs_free_path(path);
1279 149 : *start = max_hole_start;
1280 149 : if (len)
1281 89 : *len = max_hole_size;
1282 149 : return ret;
1283 : }
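
A toy user-space model of the hole-search contract documented above
(assumptions: extents sorted and non-overlapping, the device spans
[0, total), and no pending chunks to dodge):

	#include <stdint.h>
	#include <stdio.h>

	struct ext { uint64_t start, len; };

	/* Return 0 and the first hole >= num_bytes, else -1 (~ -ENOSPC)
	 * with the largest hole found, mirroring find_free_dev_extent(). */
	static int find_hole(const struct ext *e, int n, uint64_t total,
			     uint64_t num_bytes, uint64_t *start,
			     uint64_t *len)
	{
		uint64_t cur = 1024 * 1024;	/* skip the first 1MB */
		uint64_t best_start = cur, best_len = 0;
		int i;

		for (i = 0; i < n; i++) {
			if (e[i].start > cur) {
				uint64_t hole = e[i].start - cur;

				if (hole > best_len) {
					best_start = cur;
					best_len = hole;
				}
				if (hole >= num_bytes)
					goto done;
			}
			if (e[i].start + e[i].len > cur)
				cur = e[i].start + e[i].len;
		}
		if (total > cur && total - cur > best_len) {
			best_start = cur;
			best_len = total - cur;
		}
	done:
		*start = best_start;
		*len = best_len;
		return best_len >= num_bytes ? 0 : -1;
	}

	int main(void)
	{
		/* extents [1M,5M) and [8M,10M) on a 16M device */
		struct ext e[] = { { 1 << 20, 4 << 20 }, { 8 << 20, 2 << 20 } };
		uint64_t s, l;
		int ret = find_hole(e, 2, 16 << 20, 1 << 20, &s, &l);

		printf("ret=%d start=%llu len=%llu\n", ret,
		       (unsigned long long)s, (unsigned long long)l);
		return 0;	/* prints ret=0 start=5242880 len=3145728 */
	}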
1284 :
1285 116 : static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1286 : struct btrfs_device *device,
1287 : u64 start)
1288 : {
1289 : int ret;
1290 : struct btrfs_path *path;
1291 116 : struct btrfs_root *root = device->dev_root;
1292 : struct btrfs_key key;
1293 : struct btrfs_key found_key;
1294 : struct extent_buffer *leaf = NULL;
1295 : struct btrfs_dev_extent *extent = NULL;
1296 :
1297 116 : path = btrfs_alloc_path();
1298 116 : if (!path)
1299 : return -ENOMEM;
1300 :
1301 116 : key.objectid = device->devid;
1302 116 : key.offset = start;
1303 116 : key.type = BTRFS_DEV_EXTENT_KEY;
1304 : again:
1305 116 : ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1306 116 : if (ret > 0) {
1307 0 : ret = btrfs_previous_item(root, path, key.objectid,
1308 : BTRFS_DEV_EXTENT_KEY);
1309 0 : if (ret)
1310 : goto out;
1311 0 : leaf = path->nodes[0];
1312 0 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1313 0 : extent = btrfs_item_ptr(leaf, path->slots[0],
1314 : struct btrfs_dev_extent);
1315 0 : BUG_ON(found_key.offset > start || found_key.offset +
1316 : btrfs_dev_extent_length(leaf, extent) < start);
1317 0 : key = found_key;
1318 0 : btrfs_release_path(path);
1319 0 : goto again;
1320 116 : } else if (ret == 0) {
1321 116 : leaf = path->nodes[0];
1322 232 : extent = btrfs_item_ptr(leaf, path->slots[0],
1323 : struct btrfs_dev_extent);
1324 : } else {
1325 0 : btrfs_error(root->fs_info, ret, "Slot search failed");
1326 0 : goto out;
1327 : }
1328 :
1329 116 : if (device->bytes_used > 0) {
1330 : u64 len = btrfs_dev_extent_length(leaf, extent);
1331 116 : device->bytes_used -= len;
1332 116 : spin_lock(&root->fs_info->free_chunk_lock);
1333 116 : root->fs_info->free_chunk_space += len;
1334 116 : spin_unlock(&root->fs_info->free_chunk_lock);
1335 : }
1336 : ret = btrfs_del_item(trans, root, path);
1337 116 : if (ret) {
1338 0 : btrfs_error(root->fs_info, ret,
1339 : "Failed to remove dev extent item");
1340 : }
1341 : out:
1342 116 : btrfs_free_path(path);
1343 116 : return ret;
1344 : }
1345 :
1346 133 : static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1347 : struct btrfs_device *device,
1348 : u64 chunk_tree, u64 chunk_objectid,
1349 : u64 chunk_offset, u64 start, u64 num_bytes)
1350 : {
1351 : int ret;
1352 : struct btrfs_path *path;
1353 133 : struct btrfs_root *root = device->dev_root;
1354 : struct btrfs_dev_extent *extent;
1355 : struct extent_buffer *leaf;
1356 : struct btrfs_key key;
1357 :
1358 133 : WARN_ON(!device->in_fs_metadata);
1359 133 : WARN_ON(device->is_tgtdev_for_dev_replace);
1360 133 : path = btrfs_alloc_path();
1361 133 : if (!path)
1362 : return -ENOMEM;
1363 :
1364 133 : key.objectid = device->devid;
1365 133 : key.offset = start;
1366 133 : key.type = BTRFS_DEV_EXTENT_KEY;
1367 : ret = btrfs_insert_empty_item(trans, root, path, &key,
1368 : sizeof(*extent));
1369 133 : if (ret)
1370 : goto out;
1371 :
1372 133 : leaf = path->nodes[0];
1373 266 : extent = btrfs_item_ptr(leaf, path->slots[0],
1374 : struct btrfs_dev_extent);
1375 : btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1376 : btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1377 : btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1378 :
1379 133 : write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
1380 : btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);
1381 :
1382 : btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1383 133 : btrfs_mark_buffer_dirty(leaf);
1384 : out:
1385 133 : btrfs_free_path(path);
1386 133 : return ret;
1387 : }
1388 :
1389 87 : static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1390 : {
1391 : struct extent_map_tree *em_tree;
1392 : struct extent_map *em;
1393 : struct rb_node *n;
1394 : u64 ret = 0;
1395 :
1396 : em_tree = &fs_info->mapping_tree.map_tree;
1397 87 : read_lock(&em_tree->lock);
1398 87 : n = rb_last(&em_tree->map);
1399 87 : if (n) {
1400 : em = rb_entry(n, struct extent_map, rb_node);
1401 87 : ret = em->start + em->len;
1402 : }
1403 : read_unlock(&em_tree->lock);
1404 :
1405 87 : return ret;
1406 : }
1407 :
1408 0 : static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1409 : u64 *devid_ret)
1410 : {
1411 : int ret;
1412 : struct btrfs_key key;
1413 : struct btrfs_key found_key;
1414 : struct btrfs_path *path;
1415 :
1416 0 : path = btrfs_alloc_path();
1417 0 : if (!path)
1418 : return -ENOMEM;
1419 :
1420 0 : key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1421 0 : key.type = BTRFS_DEV_ITEM_KEY;
1422 0 : key.offset = (u64)-1;
1423 :
1424 0 : ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1425 0 : if (ret < 0)
1426 : goto error;
1427 :
1428 0 : BUG_ON(ret == 0); /* Corruption */
1429 :
1430 0 : ret = btrfs_previous_item(fs_info->chunk_root, path,
1431 : BTRFS_DEV_ITEMS_OBJECTID,
1432 : BTRFS_DEV_ITEM_KEY);
1433 0 : if (ret) {
1434 0 : *devid_ret = 1;
1435 : } else {
1436 0 : btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1437 : path->slots[0]);
1438 0 : *devid_ret = found_key.offset + 1;
1439 : }
1440 : ret = 0;
1441 : error:
1442 0 : btrfs_free_path(path);
1443 : return ret;
1444 : }
1445 :
1446 : /*
1447 : * the device information is stored in the chunk root;
1448 : * the btrfs_device struct should be fully filled in
1449 : */
1450 0 : static int btrfs_add_device(struct btrfs_trans_handle *trans,
1451 : struct btrfs_root *root,
1452 : struct btrfs_device *device)
1453 : {
1454 : int ret;
1455 : struct btrfs_path *path;
1456 : struct btrfs_dev_item *dev_item;
1457 : struct extent_buffer *leaf;
1458 : struct btrfs_key key;
1459 : unsigned long ptr;
1460 :
1461 0 : root = root->fs_info->chunk_root;
1462 :
1463 0 : path = btrfs_alloc_path();
1464 0 : if (!path)
1465 : return -ENOMEM;
1466 :
1467 0 : key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1468 0 : key.type = BTRFS_DEV_ITEM_KEY;
1469 0 : key.offset = device->devid;
1470 :
1471 : ret = btrfs_insert_empty_item(trans, root, path, &key,
1472 : sizeof(*dev_item));
1473 0 : if (ret)
1474 : goto out;
1475 :
1476 0 : leaf = path->nodes[0];
1477 0 : dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1478 :
1479 0 : btrfs_set_device_id(leaf, dev_item, device->devid);
1480 : btrfs_set_device_generation(leaf, dev_item, 0);
1481 0 : btrfs_set_device_type(leaf, dev_item, device->type);
1482 0 : btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1483 0 : btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1484 0 : btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1485 0 : btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1486 0 : btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1487 : btrfs_set_device_group(leaf, dev_item, 0);
1488 : btrfs_set_device_seek_speed(leaf, dev_item, 0);
1489 : btrfs_set_device_bandwidth(leaf, dev_item, 0);
1490 : btrfs_set_device_start_offset(leaf, dev_item, 0);
1491 :
1492 : ptr = btrfs_device_uuid(dev_item);
1493 0 : write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1494 : ptr = btrfs_device_fsid(dev_item);
1495 0 : write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1496 0 : btrfs_mark_buffer_dirty(leaf);
1497 :
1498 : ret = 0;
1499 : out:
1500 0 : btrfs_free_path(path);
1501 : return ret;
1502 : }
1503 :
1504 : /*
1505 : * Function to update ctime/mtime for a given device path.
1506 : * Mainly used for ctime/mtime-based probes like libblkid.
1507 : */
1508 0 : static void update_dev_time(char *path_name)
1509 : {
1510 : struct file *filp;
1511 :
1512 0 : filp = filp_open(path_name, O_RDWR, 0);
1513 0 : if (!filp)
1514 : return;
1515 0 : file_update_time(filp);
1516 0 : filp_close(filp, NULL);
1517 0 : return;
1518 : }
1519 :
1520 0 : static int btrfs_rm_dev_item(struct btrfs_root *root,
1521 : struct btrfs_device *device)
1522 : {
1523 : int ret;
1524 : struct btrfs_path *path;
1525 : struct btrfs_key key;
1526 : struct btrfs_trans_handle *trans;
1527 :
1528 0 : root = root->fs_info->chunk_root;
1529 :
1530 0 : path = btrfs_alloc_path();
1531 0 : if (!path)
1532 : return -ENOMEM;
1533 :
1534 0 : trans = btrfs_start_transaction(root, 0);
1535 0 : if (IS_ERR(trans)) {
1536 0 : btrfs_free_path(path);
1537 0 : return PTR_ERR(trans);
1538 : }
1539 0 : key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1540 0 : key.type = BTRFS_DEV_ITEM_KEY;
1541 0 : key.offset = device->devid;
1542 : lock_chunks(root);
1543 :
1544 0 : ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1545 0 : if (ret < 0)
1546 : goto out;
1547 :
1548 0 : if (ret > 0) {
1549 : ret = -ENOENT;
1550 : goto out;
1551 : }
1552 :
1553 : ret = btrfs_del_item(trans, root, path);
1554 : if (ret)
1555 : goto out;
1556 : out:
1557 0 : btrfs_free_path(path);
1558 : unlock_chunks(root);
1559 0 : btrfs_commit_transaction(trans, root);
1560 : return ret;
1561 : }
1562 :
1563 0 : int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1564 : {
1565 : struct btrfs_device *device;
1566 : struct btrfs_device *next_device;
1567 : struct block_device *bdev;
1568 0 : struct buffer_head *bh = NULL;
1569 : struct btrfs_super_block *disk_super;
1570 : struct btrfs_fs_devices *cur_devices;
1571 : u64 all_avail;
1572 : u64 devid;
1573 : u64 num_devices;
1574 : u8 *dev_uuid;
1575 : unsigned seq;
1576 : int ret = 0;
1577 : bool clear_super = false;
1578 :
1579 0 : mutex_lock(&uuid_mutex);
1580 :
1581 : do {
1582 0 : seq = read_seqbegin(&root->fs_info->profiles_lock);
1583 :
1584 0 : all_avail = root->fs_info->avail_data_alloc_bits |
1585 0 : root->fs_info->avail_system_alloc_bits |
1586 0 : root->fs_info->avail_metadata_alloc_bits;
1587 0 : } while (read_seqretry(&root->fs_info->profiles_lock, seq));
1588 :
1589 0 : num_devices = root->fs_info->fs_devices->num_devices;
1590 0 : btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1591 0 : if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1592 0 : WARN_ON(num_devices < 1);
1593 0 : num_devices--;
1594 : }
1595 0 : btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1596 :
1597 0 : if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1598 : ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
1599 : goto out;
1600 : }
1601 :
1602 0 : if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1603 : ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1604 : goto out;
1605 : }
1606 :
1607 0 : if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1608 0 : root->fs_info->fs_devices->rw_devices <= 2) {
1609 : ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
1610 : goto out;
1611 : }
1612 0 : if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1613 0 : root->fs_info->fs_devices->rw_devices <= 3) {
1614 : ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1615 : goto out;
1616 : }
1617 :
1618 0 : if (strcmp(device_path, "missing") == 0) {
1619 : struct list_head *devices;
1620 : struct btrfs_device *tmp;
1621 :
1622 : device = NULL;
1623 0 : devices = &root->fs_info->fs_devices->devices;
1624 : /*
1625 : * It is safe to read the devices since the volume_mutex
1626 : * is held.
1627 : */
1628 0 : list_for_each_entry(tmp, devices, dev_list) {
1629 0 : if (tmp->in_fs_metadata &&
1630 0 : !tmp->is_tgtdev_for_dev_replace &&
1631 0 : !tmp->bdev) {
1632 : device = tmp;
1633 : break;
1634 : }
1635 : }
1636 0 : bdev = NULL;
1637 0 : bh = NULL;
1638 : disk_super = NULL;
1639 0 : if (!device) {
1640 : ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1641 : goto out;
1642 : }
1643 : } else {
1644 0 : ret = btrfs_get_bdev_and_sb(device_path,
1645 : FMODE_WRITE | FMODE_EXCL,
1646 0 : root->fs_info->bdev_holder, 0,
1647 : &bdev, &bh);
1648 0 : if (ret)
1649 : goto out;
1650 0 : disk_super = (struct btrfs_super_block *)bh->b_data;
1651 : devid = btrfs_stack_device_id(&disk_super->dev_item);
1652 0 : dev_uuid = disk_super->dev_item.uuid;
1653 0 : device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1654 0 : disk_super->fsid);
1655 0 : if (!device) {
1656 : ret = -ENOENT;
1657 : goto error_brelse;
1658 : }
1659 : }
1660 :
1661 0 : if (device->is_tgtdev_for_dev_replace) {
1662 : ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1663 : goto error_brelse;
1664 : }
1665 :
1666 0 : if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1667 : ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1668 : goto error_brelse;
1669 : }
1670 :
1671 0 : if (device->writeable) {
1672 : lock_chunks(root);
1673 0 : list_del_init(&device->dev_alloc_list);
1674 : unlock_chunks(root);
1675 0 : root->fs_info->fs_devices->rw_devices--;
1676 : clear_super = true;
1677 : }
1678 :
1679 0 : mutex_unlock(&uuid_mutex);
1680 0 : ret = btrfs_shrink_device(device, 0);
1681 0 : mutex_lock(&uuid_mutex);
1682 0 : if (ret)
1683 : goto error_undo;
1684 :
1685 : /*
1686 : * TODO: the superblock still includes this device in its num_devices
1687 : * counter although write_all_supers() is not locked out. This
1688 : * could give a filesystem state which requires a degraded mount.
1689 : */
1690 0 : ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1691 0 : if (ret)
1692 : goto error_undo;
1693 :
1694 0 : spin_lock(&root->fs_info->free_chunk_lock);
1695 0 : root->fs_info->free_chunk_space = device->total_bytes -
1696 0 : device->bytes_used;
1697 0 : spin_unlock(&root->fs_info->free_chunk_lock);
1698 :
1699 0 : device->in_fs_metadata = 0;
1700 0 : btrfs_scrub_cancel_dev(root->fs_info, device);
1701 :
1702 : /*
1703 : * the device list mutex makes sure that we don't change
1704 : * the device list while someone else is writing out all
1705 : * the device supers. Whoever is writing all supers should
1706 : * lock the device list mutex before getting the number of
1707 : * devices in the super block (super_copy). Conversely,
1708 : * whoever updates the number of devices in the super block
1709 : * (super_copy) should hold the device list mutex.
1710 : */
1711 :
1712 0 : cur_devices = device->fs_devices;
1713 0 : mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1714 0 : list_del_rcu(&device->dev_list);
1715 :
1716 0 : device->fs_devices->num_devices--;
1717 0 : device->fs_devices->total_devices--;
1718 :
1719 0 : if (device->missing)
1720 0 : device->fs_devices->missing_devices--;
1721 :
1722 0 : next_device = list_entry(root->fs_info->fs_devices->devices.next,
1723 : struct btrfs_device, dev_list);
1724 0 : if (device->bdev == root->fs_info->sb->s_bdev)
1725 0 : root->fs_info->sb->s_bdev = next_device->bdev;
1726 0 : if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1727 0 : root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1728 :
1729 0 : if (device->bdev) {
1730 0 : device->fs_devices->open_devices--;
1731 : /* remove sysfs entry */
1732 0 : btrfs_kobj_rm_device(root->fs_info, device);
1733 : }
1734 :
1735 0 : call_rcu(&device->rcu, free_device);
1736 :
1737 0 : num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1738 : btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1739 0 : mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1740 :
1741 0 : if (cur_devices->open_devices == 0) {
1742 : struct btrfs_fs_devices *fs_devices;
1743 0 : fs_devices = root->fs_info->fs_devices;
1744 0 : while (fs_devices) {
1745 0 : if (fs_devices->seed == cur_devices) {
1746 0 : fs_devices->seed = cur_devices->seed;
1747 0 : break;
1748 : }
1749 : fs_devices = fs_devices->seed;
1750 : }
1751 0 : cur_devices->seed = NULL;
1752 : lock_chunks(root);
1753 0 : __btrfs_close_devices(cur_devices);
1754 : unlock_chunks(root);
1755 0 : free_fs_devices(cur_devices);
1756 : }
1757 :
1758 0 : root->fs_info->num_tolerated_disk_barrier_failures =
1759 0 : btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1760 :
1761 : /*
1762 : * at this point, the device is zero sized. We want to
1763 : * remove it from the devices list and zero out the old super
1764 : */
1765 0 : if (clear_super && disk_super) {
1766 : u64 bytenr;
1767 : int i;
1768 :
1769 : /* make sure this device isn't detected as part of
1770 : * the FS anymore
1771 : */
1772 0 : memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1773 0 : set_buffer_dirty(bh);
1774 0 : sync_dirty_buffer(bh);
1775 :
1776 : /* clear the mirror copies of super block on the disk
1777 : * being removed; the 0th copy is taken care of above and
1778 : * the loop below takes care of the rest
1779 : */
1780 0 : for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1781 : bytenr = btrfs_sb_offset(i);
1782 0 : if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1783 0 : i_size_read(bdev->bd_inode))
1784 : break;
1785 :
1786 0 : brelse(bh);
1787 0 : bh = __bread(bdev, bytenr / 4096,
1788 : BTRFS_SUPER_INFO_SIZE);
1789 0 : if (!bh)
1790 0 : continue;
1791 :
1792 0 : disk_super = (struct btrfs_super_block *)bh->b_data;
1793 :
1794 0 : if (btrfs_super_bytenr(disk_super) != bytenr ||
1795 : btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1796 0 : continue;
1797 : }
1798 0 : memset(&disk_super->magic, 0,
1799 : sizeof(disk_super->magic));
1800 0 : set_buffer_dirty(bh);
1801 0 : sync_dirty_buffer(bh);
1802 : }
1803 : }
1804 :
1805 : ret = 0;
1806 :
1807 0 : if (bdev) {
1808 : /* Notify udev that device has changed */
1809 0 : btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1810 :
1811 : /* Update ctime/mtime for device path for libblkid */
1812 0 : update_dev_time(device_path);
1813 : }
1814 :
1815 : error_brelse:
1816 0 : brelse(bh);
1817 0 : if (bdev)
1818 0 : blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1819 : out:
1820 0 : mutex_unlock(&uuid_mutex);
1821 0 : return ret;
1822 : error_undo:
1823 0 : if (device->writeable) {
1824 : lock_chunks(root);
1825 0 : list_add(&device->dev_alloc_list,
1826 0 : &root->fs_info->fs_devices->alloc_list);
1827 : unlock_chunks(root);
1828 0 : root->fs_info->fs_devices->rw_devices++;
1829 : }
1830 : goto error_brelse;
1831 : }
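/*
 * A minimal sketch of the device_list_mutex protocol described in the
 * function above, assuming a hypothetical write_one_super() helper:
 * whoever writes out all supers samples the device count under the
 * mutex, and whoever updates that count holds the same mutex, so the
 * count and the list can never disagree mid-write:
 *
 *	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 *	num_devices = btrfs_super_num_devices(fs_info->super_copy);
 *	list_for_each_entry(dev, &fs_info->fs_devices->devices, dev_list)
 *		write_one_super(dev, num_devices);	(hypothetical helper)
 *	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 */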
1832 :
1833 7 : void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1834 : struct btrfs_device *srcdev)
1835 : {
1836 14 : WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1837 :
1838 7 : list_del_rcu(&srcdev->dev_list);
1839 7 : list_del_rcu(&srcdev->dev_alloc_list);
1840 7 : fs_info->fs_devices->num_devices--;
1841 7 : if (srcdev->missing) {
1842 0 : fs_info->fs_devices->missing_devices--;
1843 0 : fs_info->fs_devices->rw_devices++;
1844 : }
1845 7 : if (srcdev->can_discard)
1846 0 : fs_info->fs_devices->num_can_discard--;
1847 7 : if (srcdev->bdev) {
1848 7 : fs_info->fs_devices->open_devices--;
1849 :
1850 :                 /*
1851 :                  * zero out the old super, but only if the device is
1852 :                  * writable; a read-only seed device keeps its super
1853 :                  */
1854 7 : if (srcdev->writeable)
1855 7 : btrfs_scratch_superblock(srcdev);
1856 : }
1857 :
1858 7 : call_rcu(&srcdev->rcu, free_device);
1859 7 : }
1860 :
1861 1 : void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1862 : struct btrfs_device *tgtdev)
1863 : {
1864 : struct btrfs_device *next_device;
1865 :
1866 1 : WARN_ON(!tgtdev);
1867 1 : mutex_lock(&fs_info->fs_devices->device_list_mutex);
1868 1 : if (tgtdev->bdev) {
1869 1 : btrfs_scratch_superblock(tgtdev);
1870 1 : fs_info->fs_devices->open_devices--;
1871 : }
1872 1 : fs_info->fs_devices->num_devices--;
1873 1 : if (tgtdev->can_discard)
1874           0 : 		fs_info->fs_devices->num_can_discard--;
1875 :
1876 1 : next_device = list_entry(fs_info->fs_devices->devices.next,
1877 : struct btrfs_device, dev_list);
1878 1 : if (tgtdev->bdev == fs_info->sb->s_bdev)
1879 0 : fs_info->sb->s_bdev = next_device->bdev;
1880 1 : if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1881 0 : fs_info->fs_devices->latest_bdev = next_device->bdev;
1882 1 : list_del_rcu(&tgtdev->dev_list);
1883 :
1884 1 : call_rcu(&tgtdev->rcu, free_device);
1885 :
1886 1 : mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1887 1 : }
1888 :
1889 8 : static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1890 : struct btrfs_device **device)
1891 : {
1892 : int ret = 0;
1893 : struct btrfs_super_block *disk_super;
1894 : u64 devid;
1895 : u8 *dev_uuid;
1896 : struct block_device *bdev;
1897 : struct buffer_head *bh;
1898 :
1899 8 : *device = NULL;
1900 8 : ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1901 8 : root->fs_info->bdev_holder, 0, &bdev, &bh);
1902 8 : if (ret)
1903 : return ret;
1904 8 : disk_super = (struct btrfs_super_block *)bh->b_data;
1905 : devid = btrfs_stack_device_id(&disk_super->dev_item);
1906 8 : dev_uuid = disk_super->dev_item.uuid;
1907 8 : *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1908 8 : disk_super->fsid);
1909 : brelse(bh);
1910 8 : if (!*device)
1911 : ret = -ENOENT;
1912 8 : blkdev_put(bdev, FMODE_READ);
1913 : return ret;
1914 : }
1915 :
1916 8 : int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1917 : char *device_path,
1918 : struct btrfs_device **device)
1919 : {
1920 8 : *device = NULL;
1921 8 : if (strcmp(device_path, "missing") == 0) {
1922 : struct list_head *devices;
1923 : struct btrfs_device *tmp;
1924 :
1925 0 : devices = &root->fs_info->fs_devices->devices;
1926 : /*
1927 : * It is safe to read the devices since the volume_mutex
1928 : * is held by the caller.
1929 : */
1930 0 : list_for_each_entry(tmp, devices, dev_list) {
1931 0 : if (tmp->in_fs_metadata && !tmp->bdev) {
1932 0 : *device = tmp;
1933 0 : break;
1934 : }
1935 : }
1936 :
1937 0 : if (!*device) {
1938 0 : btrfs_err(root->fs_info, "no missing device found");
1939 0 : return -ENOENT;
1940 : }
1941 :
1942 : return 0;
1943 : } else {
1944 8 : return btrfs_find_device_by_path(root, device_path, device);
1945 : }
1946 : }
1947 :
1948 : /*
1949 :  * Does all the dirty work required for changing the filesystem's UUID.
1950 : */
1951 0 : static int btrfs_prepare_sprout(struct btrfs_root *root)
1952 : {
1953 0 : struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1954 : struct btrfs_fs_devices *old_devices;
1955 : struct btrfs_fs_devices *seed_devices;
1956 0 : struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1957 : struct btrfs_device *device;
1958 : u64 super_flags;
1959 :
1960 0 : BUG_ON(!mutex_is_locked(&uuid_mutex));
1961 0 : if (!fs_devices->seeding)
1962 : return -EINVAL;
1963 :
1964 0 : seed_devices = __alloc_fs_devices();
1965 0 : if (IS_ERR(seed_devices))
1966 0 : return PTR_ERR(seed_devices);
1967 :
1968 0 : old_devices = clone_fs_devices(fs_devices);
1969 0 : if (IS_ERR(old_devices)) {
1970 0 : kfree(seed_devices);
1971 0 : return PTR_ERR(old_devices);
1972 : }
1973 :
1974 0 : list_add(&old_devices->list, &fs_uuids);
1975 :
1976 0 : memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1977 0 : seed_devices->opened = 1;
1978 0 : INIT_LIST_HEAD(&seed_devices->devices);
1979 0 : INIT_LIST_HEAD(&seed_devices->alloc_list);
1980 0 : mutex_init(&seed_devices->device_list_mutex);
1981 :
1982 0 : mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1983 0 : list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1984 : synchronize_rcu);
1985 :
1986 0 : list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1987 0 : list_for_each_entry(device, &seed_devices->devices, dev_list) {
1988 0 : device->fs_devices = seed_devices;
1989 : }
1990 :
1991 0 : fs_devices->seeding = 0;
1992 0 : fs_devices->num_devices = 0;
1993 0 : fs_devices->open_devices = 0;
1994 0 : fs_devices->missing_devices = 0;
1995 0 : fs_devices->num_can_discard = 0;
1996 0 : fs_devices->rotating = 0;
1997 0 : fs_devices->seed = seed_devices;
1998 :
1999 0 : generate_random_uuid(fs_devices->fsid);
2000 0 : memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2001 0 : memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2002 0 : mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2003 :
2004 0 : super_flags = btrfs_super_flags(disk_super) &
2005 : ~BTRFS_SUPER_FLAG_SEEDING;
2006 : btrfs_set_super_flags(disk_super, super_flags);
2007 :
2008 : return 0;
2009 : }
2010 :
2011 : /*
2012 :  * store the expected generation for seed devices in device items.
2013 : */
2014 0 : static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2015 : struct btrfs_root *root)
2016 : {
2017 : struct btrfs_path *path;
2018 0 : struct extent_buffer *leaf;
2019 : struct btrfs_dev_item *dev_item;
2020 : struct btrfs_device *device;
2021 : struct btrfs_key key;
2022 : u8 fs_uuid[BTRFS_UUID_SIZE];
2023 : u8 dev_uuid[BTRFS_UUID_SIZE];
2024 : u64 devid;
2025 : int ret;
2026 :
2027 0 : path = btrfs_alloc_path();
2028 0 : if (!path)
2029 : return -ENOMEM;
2030 :
2031 0 : root = root->fs_info->chunk_root;
2032 0 : key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2033 0 : key.offset = 0;
2034 0 : key.type = BTRFS_DEV_ITEM_KEY;
2035 :
2036 : while (1) {
2037 0 : ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2038 0 : if (ret < 0)
2039 : goto error;
2040 :
2041 0 : leaf = path->nodes[0];
2042 : next_slot:
2043 0 : if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2044 0 : ret = btrfs_next_leaf(root, path);
2045 0 : if (ret > 0)
2046 : break;
2047 0 : if (ret < 0)
2048 : goto error;
2049 0 : leaf = path->nodes[0];
2050 0 : btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2051 0 : btrfs_release_path(path);
2052 0 : continue;
2053 : }
2054 :
2055 0 : btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2056 0 : if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2057 0 : key.type != BTRFS_DEV_ITEM_KEY)
2058 : break;
2059 :
2060 0 : dev_item = btrfs_item_ptr(leaf, path->slots[0],
2061 : struct btrfs_dev_item);
2062 : devid = btrfs_device_id(leaf, dev_item);
2063 0 : read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2064 : BTRFS_UUID_SIZE);
2065 0 : read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2066 : BTRFS_UUID_SIZE);
2067 0 : device = btrfs_find_device(root->fs_info, devid, dev_uuid,
2068 : fs_uuid);
2069 0 : BUG_ON(!device); /* Logic error */
2070 :
2071 0 : if (device->fs_devices->seeding) {
2072 0 : btrfs_set_device_generation(leaf, dev_item,
2073 : device->generation);
2074 0 : btrfs_mark_buffer_dirty(leaf);
2075 : }
2076 :
2077 0 : path->slots[0]++;
2078 : goto next_slot;
2079 : }
2080 : ret = 0;
2081 : error:
2082 0 : btrfs_free_path(path);
2083 : return ret;
2084 : }
2085 :
2086 0 : int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2087 : {
2088 : struct request_queue *q;
2089 : struct btrfs_trans_handle *trans;
2090 : struct btrfs_device *device;
2091 0 : struct block_device *bdev;
2092 : struct list_head *devices;
2093 0 : struct super_block *sb = root->fs_info->sb;
2094 : struct rcu_string *name;
2095 : u64 total_bytes;
2096 : int seeding_dev = 0;
2097 : int ret = 0;
2098 :
2099 0 : if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
2100 : return -EROFS;
2101 :
2102 0 : bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2103 : root->fs_info->bdev_holder);
2104 0 : if (IS_ERR(bdev))
2105 0 : return PTR_ERR(bdev);
2106 :
2107 0 : if (root->fs_info->fs_devices->seeding) {
2108 : seeding_dev = 1;
2109 0 : down_write(&sb->s_umount);
2110 0 : mutex_lock(&uuid_mutex);
2111 : }
2112 :
2113 0 : filemap_write_and_wait(bdev->bd_inode->i_mapping);
2114 :
2115 0 : devices = &root->fs_info->fs_devices->devices;
2116 :
2117 0 : mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2118 0 : list_for_each_entry(device, devices, dev_list) {
2119 0 : if (device->bdev == bdev) {
2120 : ret = -EEXIST;
2121 0 : mutex_unlock(
2122 0 : &root->fs_info->fs_devices->device_list_mutex);
2123 0 : goto error;
2124 : }
2125 : }
2126 0 : mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2127 :
2128 0 : device = btrfs_alloc_device(root->fs_info, NULL, NULL);
2129 0 : if (IS_ERR(device)) {
2130 : /* we can safely leave the fs_devices entry around */
2131 0 : ret = PTR_ERR(device);
2132 0 : goto error;
2133 : }
2134 :
2135 0 : name = rcu_string_strdup(device_path, GFP_NOFS);
2136 0 : if (!name) {
2137 0 : kfree(device);
2138 : ret = -ENOMEM;
2139 0 : goto error;
2140 : }
2141 0 : rcu_assign_pointer(device->name, name);
2142 :
2143 0 : trans = btrfs_start_transaction(root, 0);
2144 0 : if (IS_ERR(trans)) {
2145 0 : rcu_string_free(device->name);
2146 0 : kfree(device);
2147 0 : ret = PTR_ERR(trans);
2148 0 : goto error;
2149 : }
2150 :
2151 : lock_chunks(root);
2152 :
2153 : q = bdev_get_queue(bdev);
2154 0 : if (blk_queue_discard(q))
2155 0 : device->can_discard = 1;
2156 0 : device->writeable = 1;
2157 0 : device->generation = trans->transid;
2158 0 : device->io_width = root->sectorsize;
2159 0 : device->io_align = root->sectorsize;
2160 0 : device->sector_size = root->sectorsize;
2161 0 : device->total_bytes = i_size_read(bdev->bd_inode);
2162 0 : device->disk_total_bytes = device->total_bytes;
2163 0 : device->dev_root = root->fs_info->dev_root;
2164 0 : device->bdev = bdev;
2165 0 : device->in_fs_metadata = 1;
2166 0 : device->is_tgtdev_for_dev_replace = 0;
2167 0 : device->mode = FMODE_EXCL;
2168 0 : device->dev_stats_valid = 1;
2169 0 : set_blocksize(device->bdev, 4096);
2170 :
2171 0 : if (seeding_dev) {
2172 0 : sb->s_flags &= ~MS_RDONLY;
2173 0 : ret = btrfs_prepare_sprout(root);
2174 0 : BUG_ON(ret); /* -ENOMEM */
2175 : }
2176 :
2177 0 : device->fs_devices = root->fs_info->fs_devices;
2178 :
2179 0 : mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2180 0 : list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
2181 0 : list_add(&device->dev_alloc_list,
2182 0 : &root->fs_info->fs_devices->alloc_list);
2183 0 : root->fs_info->fs_devices->num_devices++;
2184 0 : root->fs_info->fs_devices->open_devices++;
2185 0 : root->fs_info->fs_devices->rw_devices++;
2186 0 : root->fs_info->fs_devices->total_devices++;
2187 0 : if (device->can_discard)
2188 0 : root->fs_info->fs_devices->num_can_discard++;
2189 0 : root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2190 :
2191 0 : spin_lock(&root->fs_info->free_chunk_lock);
2192 0 : root->fs_info->free_chunk_space += device->total_bytes;
2193 0 : spin_unlock(&root->fs_info->free_chunk_lock);
2194 :
2195 0 : if (!blk_queue_nonrot(bdev_get_queue(bdev)))
2196 0 : root->fs_info->fs_devices->rotating = 1;
2197 :
2198 0 : total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
2199 0 : btrfs_set_super_total_bytes(root->fs_info->super_copy,
2200 0 : total_bytes + device->total_bytes);
2201 :
2202 0 : total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
2203 0 : btrfs_set_super_num_devices(root->fs_info->super_copy,
2204 : total_bytes + 1);
2205 :
2206 : /* add sysfs device entry */
2207 0 : btrfs_kobj_add_device(root->fs_info, device);
2208 :
2209 0 : mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2210 :
2211 0 : if (seeding_dev) {
2212 : char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2213 0 : ret = init_first_rw_device(trans, root, device);
2214 0 : if (ret) {
2215 0 : btrfs_abort_transaction(trans, root, ret);
2216 0 : goto error_trans;
2217 : }
2218 0 : ret = btrfs_finish_sprout(trans, root);
2219 0 : if (ret) {
2220 0 : btrfs_abort_transaction(trans, root, ret);
2221 0 : goto error_trans;
2222 : }
2223 :
2224 :                 /* Sprouting changes the fsid of the mounted filesystem,
2225 :                  * so rename the fsid entry in sysfs to match
2226 :                  */
2227 0 : snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2228 0 : root->fs_info->fsid);
2229 0 : if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
2230 : goto error_trans;
2231 : } else {
2232 0 : ret = btrfs_add_device(trans, root, device);
2233 0 : if (ret) {
2234 0 : btrfs_abort_transaction(trans, root, ret);
2235 0 : goto error_trans;
2236 : }
2237 : }
2238 :
2239 : /*
2240 : * we've got more storage, clear any full flags on the space
2241 : * infos
2242 : */
2243 0 : btrfs_clear_space_info_full(root->fs_info);
2244 :
2245 : unlock_chunks(root);
2246 0 : root->fs_info->num_tolerated_disk_barrier_failures =
2247 0 : btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
2248 0 : ret = btrfs_commit_transaction(trans, root);
2249 :
2250 0 : if (seeding_dev) {
2251 0 : mutex_unlock(&uuid_mutex);
2252 0 : up_write(&sb->s_umount);
2253 :
2254 0 : if (ret) /* transaction commit */
2255 : return ret;
2256 :
2257 0 : ret = btrfs_relocate_sys_chunks(root);
2258 0 : if (ret < 0)
2259 0 : btrfs_error(root->fs_info, ret,
2260 : "Failed to relocate sys chunks after "
2261 : "device initialization. This can be fixed "
2262 : "using the \"btrfs balance\" command.");
2263 0 : trans = btrfs_attach_transaction(root);
2264 0 : if (IS_ERR(trans)) {
2265 0 : if (PTR_ERR(trans) == -ENOENT)
2266 : return 0;
2267 0 : return PTR_ERR(trans);
2268 : }
2269 0 : ret = btrfs_commit_transaction(trans, root);
2270 : }
2271 :
2272 : /* Update ctime/mtime for libblkid */
2273 0 : update_dev_time(device_path);
2274 0 : return ret;
2275 :
2276 : error_trans:
2277 : unlock_chunks(root);
2278 0 : btrfs_end_transaction(trans, root);
2279 0 : rcu_string_free(device->name);
2280 0 : btrfs_kobj_rm_device(root->fs_info, device);
2281 0 : kfree(device);
2282 : error:
2283 0 : blkdev_put(bdev, FMODE_EXCL);
2284 0 : if (seeding_dev) {
2285 0 : mutex_unlock(&uuid_mutex);
2286 0 : up_write(&sb->s_umount);
2287 : }
2288 0 : return ret;
2289 : }
2290 :
2291 8 : int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2292 : struct btrfs_device **device_out)
2293 : {
2294 : struct request_queue *q;
2295 : struct btrfs_device *device;
2296 8 : struct block_device *bdev;
2297 8 : struct btrfs_fs_info *fs_info = root->fs_info;
2298 : struct list_head *devices;
2299 : struct rcu_string *name;
2300 8 : u64 devid = BTRFS_DEV_REPLACE_DEVID;
2301 : int ret = 0;
2302 :
2303 8 : *device_out = NULL;
2304 8 : if (fs_info->fs_devices->seeding)
2305 : return -EINVAL;
2306 :
2307 8 : bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2308 : fs_info->bdev_holder);
2309 8 : if (IS_ERR(bdev))
2310 0 : return PTR_ERR(bdev);
2311 :
2312 16 : filemap_write_and_wait(bdev->bd_inode->i_mapping);
2313 :
2314 8 : devices = &fs_info->fs_devices->devices;
2315 19 : list_for_each_entry(device, devices, dev_list) {
2316 11 : if (device->bdev == bdev) {
2317 : ret = -EEXIST;
2318 : goto error;
2319 : }
2320 : }
2321 :
2322 8 : device = btrfs_alloc_device(NULL, &devid, NULL);
2323 8 : if (IS_ERR(device)) {
2324 0 : ret = PTR_ERR(device);
2325 0 : goto error;
2326 : }
2327 :
2328 8 : name = rcu_string_strdup(device_path, GFP_NOFS);
2329 8 : if (!name) {
2330 0 : kfree(device);
2331 : ret = -ENOMEM;
2332 0 : goto error;
2333 : }
2334 8 : rcu_assign_pointer(device->name, name);
2335 :
2336 : q = bdev_get_queue(bdev);
2337 8 : if (blk_queue_discard(q))
2338 0 : device->can_discard = 1;
2339 8 : mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2340 8 : device->writeable = 1;
2341 8 : device->generation = 0;
2342 8 : device->io_width = root->sectorsize;
2343 8 : device->io_align = root->sectorsize;
2344 8 : device->sector_size = root->sectorsize;
2345 16 : device->total_bytes = i_size_read(bdev->bd_inode);
2346 8 : device->disk_total_bytes = device->total_bytes;
2347 8 : device->dev_root = fs_info->dev_root;
2348 8 : device->bdev = bdev;
2349 8 : device->in_fs_metadata = 1;
2350 8 : device->is_tgtdev_for_dev_replace = 1;
2351 8 : device->mode = FMODE_EXCL;
2352 8 : device->dev_stats_valid = 1;
2353 8 : set_blocksize(device->bdev, 4096);
2354 8 : device->fs_devices = fs_info->fs_devices;
2355 8 : list_add(&device->dev_list, &fs_info->fs_devices->devices);
2356 8 : fs_info->fs_devices->num_devices++;
2357 8 : fs_info->fs_devices->open_devices++;
2358 8 : if (device->can_discard)
2359 0 : fs_info->fs_devices->num_can_discard++;
2360 8 : mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2361 :
2362 8 : *device_out = device;
2363 8 : return ret;
2364 :
2365 : error:
2366 0 : blkdev_put(bdev, FMODE_EXCL);
2367 0 : return ret;
2368 : }
2369 :
2370 0 : void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2371 : struct btrfs_device *tgtdev)
2372 : {
2373 0 : WARN_ON(fs_info->fs_devices->rw_devices == 0);
2374 0 : tgtdev->io_width = fs_info->dev_root->sectorsize;
2375 0 : tgtdev->io_align = fs_info->dev_root->sectorsize;
2376 0 : tgtdev->sector_size = fs_info->dev_root->sectorsize;
2377 0 : tgtdev->dev_root = fs_info->dev_root;
2378 0 : tgtdev->in_fs_metadata = 1;
2379 0 : }
2380 :
2381 249 : static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2382 : struct btrfs_device *device)
2383 : {
2384 : int ret;
2385 : struct btrfs_path *path;
2386 : struct btrfs_root *root;
2387 : struct btrfs_dev_item *dev_item;
2388 : struct extent_buffer *leaf;
2389 : struct btrfs_key key;
2390 :
2391 249 : root = device->dev_root->fs_info->chunk_root;
2392 :
2393 249 : path = btrfs_alloc_path();
2394 249 : if (!path)
2395 : return -ENOMEM;
2396 :
2397 249 : key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2398 249 : key.type = BTRFS_DEV_ITEM_KEY;
2399 249 : key.offset = device->devid;
2400 :
2401 249 : ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2402 249 : if (ret < 0)
2403 : goto out;
2404 :
2405 249 : if (ret > 0) {
2406 : ret = -ENOENT;
2407 : goto out;
2408 : }
2409 :
2410 249 : leaf = path->nodes[0];
2411 498 : dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2412 :
2413 249 : btrfs_set_device_id(leaf, dev_item, device->devid);
2414 249 : btrfs_set_device_type(leaf, dev_item, device->type);
2415 249 : btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2416 249 : btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2417 249 : btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2418 249 : btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
2419 249 : btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
2420 249 : btrfs_mark_buffer_dirty(leaf);
2421 :
2422 : out:
2423 249 : btrfs_free_path(path);
2424 249 : return ret;
2425 : }
2426 :
2427 0 : static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
2428 : struct btrfs_device *device, u64 new_size)
2429 : {
2430 0 : struct btrfs_super_block *super_copy =
2431 0 : device->dev_root->fs_info->super_copy;
2432 : u64 old_total = btrfs_super_total_bytes(super_copy);
2433 0 : u64 diff = new_size - device->total_bytes;
2434 :
2435 0 : if (!device->writeable)
2436 : return -EACCES;
2437 0 : if (new_size <= device->total_bytes ||
2438 0 : device->is_tgtdev_for_dev_replace)
2439 : return -EINVAL;
2440 :
2441 0 : btrfs_set_super_total_bytes(super_copy, old_total + diff);
2442 0 : device->fs_devices->total_rw_bytes += diff;
2443 :
2444 0 : device->total_bytes = new_size;
2445 0 : device->disk_total_bytes = new_size;
2446 0 : btrfs_clear_space_info_full(device->dev_root->fs_info);
2447 :
2448 0 : return btrfs_update_device(trans, device);
2449 : }
2450 :
2451 0 : int btrfs_grow_device(struct btrfs_trans_handle *trans,
2452 : struct btrfs_device *device, u64 new_size)
2453 : {
2454 : int ret;
2455 0 : lock_chunks(device->dev_root);
2456 0 : ret = __btrfs_grow_device(trans, device, new_size);
2457 0 : unlock_chunks(device->dev_root);
2458 0 : return ret;
2459 : }
2460 :
2461 72 : static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2462 : struct btrfs_root *root,
2463 : u64 chunk_tree, u64 chunk_objectid,
2464 : u64 chunk_offset)
2465 : {
2466 : int ret;
2467 : struct btrfs_path *path;
2468 : struct btrfs_key key;
2469 :
2470 72 : root = root->fs_info->chunk_root;
2471 72 : path = btrfs_alloc_path();
2472 72 : if (!path)
2473 : return -ENOMEM;
2474 :
2475 72 : key.objectid = chunk_objectid;
2476 72 : key.offset = chunk_offset;
2477 72 : key.type = BTRFS_CHUNK_ITEM_KEY;
2478 :
2479 72 : ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2480 72 : if (ret < 0)
2481 : goto out;
2482 72 : else if (ret > 0) { /* Logic error or corruption */
2483 0 : btrfs_error(root->fs_info, -ENOENT,
2484 : "Failed lookup while freeing chunk.");
2485 : ret = -ENOENT;
2486 : goto out;
2487 : }
2488 :
2489 : ret = btrfs_del_item(trans, root, path);
2490 72 : if (ret < 0)
2491 0 : btrfs_error(root->fs_info, ret,
2492 : "Failed to delete chunk item.");
2493 : out:
2494 72 : btrfs_free_path(path);
2495 : return ret;
2496 : }
2497 :
2498 25 : static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2499 : chunk_offset)
2500 : {
2501 25 : struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2502 : struct btrfs_disk_key *disk_key;
2503 : struct btrfs_chunk *chunk;
2504 : u8 *ptr;
2505 : int ret = 0;
2506 : u32 num_stripes;
2507 : u32 array_size;
2508 : u32 len = 0;
2509 : u32 cur;
2510 : struct btrfs_key key;
2511 :
2512 : array_size = btrfs_super_sys_array_size(super_copy);
2513 :
2514 25 : ptr = super_copy->sys_chunk_array;
2515 : cur = 0;
2516 :
2517 78 : while (cur < array_size) {
2518 : disk_key = (struct btrfs_disk_key *)ptr;
2519 : btrfs_disk_key_to_cpu(&key, disk_key);
2520 :
2521 : len = sizeof(*disk_key);
2522 :
2523 53 : if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2524 : chunk = (struct btrfs_chunk *)(ptr + len);
2525 : num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2526 106 : len += btrfs_chunk_item_size(num_stripes);
2527 : } else {
2528 : ret = -EIO;
2529 : break;
2530 : }
2531 53 : if (key.objectid == chunk_objectid &&
2532 : key.offset == chunk_offset) {
2533 25 : memmove(ptr, ptr + len, array_size - (cur + len));
2534 25 : array_size -= len;
2535 : btrfs_set_super_sys_array_size(super_copy, array_size);
2536 : } else {
2537 28 : ptr += len;
2538 28 : cur += len;
2539 : }
2540 : }
2541 25 : return ret;
2542 : }
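/*
 * The walk above relies on the layout of sys_chunk_array: a packed
 * sequence of (disk key, chunk item) pairs, where each chunk item's
 * size depends on its stripe count. A sketch of that layout:
 *
 *	+----------+--------------------+----------+--------------...
 *	| disk_key | chunk (n0 stripes) | disk_key | chunk (n1 ...)
 *	+----------+--------------------+----------+--------------...
 *
 * Each step advances by sizeof(*disk_key) +
 * btrfs_chunk_item_size(num_stripes), and deleting an entry is a
 * memmove() of everything behind it plus shrinking the array size.
 */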
2543 :
2544 313 : static int btrfs_relocate_chunk(struct btrfs_root *root,
2545 : u64 chunk_tree, u64 chunk_objectid,
2546 : u64 chunk_offset)
2547 : {
2548 : struct extent_map_tree *em_tree;
2549 : struct btrfs_root *extent_root;
2550 : struct btrfs_trans_handle *trans;
2551 : struct extent_map *em;
2552 : struct map_lookup *map;
2553 : int ret;
2554 : int i;
2555 :
2556 72 : root = root->fs_info->chunk_root;
2557 72 : extent_root = root->fs_info->extent_root;
2558 72 : em_tree = &root->fs_info->mapping_tree.map_tree;
2559 :
2560 72 : ret = btrfs_can_relocate(extent_root, chunk_offset);
2561 72 : if (ret)
2562 : return -ENOSPC;
2563 :
2564 : /* step one, relocate all the extents inside this chunk */
2565 72 : ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2566 72 : if (ret)
2567 : return ret;
2568 :
2569 72 : trans = btrfs_start_transaction(root, 0);
2570 72 : if (IS_ERR(trans)) {
2571 0 : ret = PTR_ERR(trans);
2572 0 : btrfs_std_error(root->fs_info, ret);
2573 : return ret;
2574 : }
2575 :
2576 : lock_chunks(root);
2577 :
2578 : /*
2579 : * step two, delete the device extents and the
2580 : * chunk tree entries
2581 : */
2582 72 : read_lock(&em_tree->lock);
2583 72 : em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2584 : read_unlock(&em_tree->lock);
2585 :
2586 72 : BUG_ON(!em || em->start > chunk_offset ||
2587 : em->start + em->len < chunk_offset);
2588 72 : map = (struct map_lookup *)em->bdev;
2589 :
2590 116 : for (i = 0; i < map->num_stripes; i++) {
2591 116 : ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
2592 : map->stripes[i].physical);
2593 116 : BUG_ON(ret);
2594 :
2595 116 : if (map->stripes[i].dev) {
2596 116 : ret = btrfs_update_device(trans, map->stripes[i].dev);
2597 116 : BUG_ON(ret);
2598 : }
2599 : }
2600 72 : ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2601 : chunk_offset);
2602 :
2603 72 : BUG_ON(ret);
2604 :
2605 72 : trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2606 :
2607 72 : if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2608 25 : ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2609 25 : BUG_ON(ret);
2610 : }
2611 :
2612 72 : ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2613 72 : BUG_ON(ret);
2614 :
2615 72 : write_lock(&em_tree->lock);
2616 72 : remove_extent_mapping(em_tree, em);
2617 : write_unlock(&em_tree->lock);
2618 :
2619 : /* once for the tree */
2620 72 : free_extent_map(em);
2621 : /* once for us */
2622 72 : free_extent_map(em);
2623 :
2624 : unlock_chunks(root);
2625 72 : btrfs_end_transaction(trans, root);
2626 : return 0;
2627 : }
2628 :
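/*
 * The relocation order above, for reference: (1) check the chunk can
 * be relocated and move its extents elsewhere, then (2) in a single
 * transaction drop each stripe's dev extent, update the device items,
 * delete the chunk item (plus its sys_chunk_array copy for SYSTEM
 * chunks), remove the block group, and drop the extent mapping.
 */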
2629 0 : static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2630 : {
2631 0 : struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2632 : struct btrfs_path *path;
2633 : struct extent_buffer *leaf;
2634 : struct btrfs_chunk *chunk;
2635 : struct btrfs_key key;
2636 : struct btrfs_key found_key;
2637 : u64 chunk_tree = chunk_root->root_key.objectid;
2638 : u64 chunk_type;
2639 : bool retried = false;
2640 : int failed = 0;
2641 : int ret;
2642 :
2643 0 : path = btrfs_alloc_path();
2644 0 : if (!path)
2645 : return -ENOMEM;
2646 :
2647 : again:
2648 0 : key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2649 0 : key.offset = (u64)-1;
2650 0 : key.type = BTRFS_CHUNK_ITEM_KEY;
2651 :
2652 : while (1) {
2653 0 : ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2654 0 : if (ret < 0)
2655 : goto error;
2656 0 : BUG_ON(ret == 0); /* Corruption */
2657 :
2658 0 : ret = btrfs_previous_item(chunk_root, path, key.objectid,
2659 0 : key.type);
2660 0 : if (ret < 0)
2661 : goto error;
2662 0 : if (ret > 0)
2663 : break;
2664 :
2665 0 : leaf = path->nodes[0];
2666 0 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2667 :
2668 0 : chunk = btrfs_item_ptr(leaf, path->slots[0],
2669 : struct btrfs_chunk);
2670 : chunk_type = btrfs_chunk_type(leaf, chunk);
2671 0 : btrfs_release_path(path);
2672 :
2673 0 : if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2674 0 : ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2675 : found_key.objectid,
2676 : found_key.offset);
2677 0 : if (ret == -ENOSPC)
2678 0 : failed++;
2679 0 : else if (ret)
2680 0 : BUG();
2681 : }
2682 :
2683 0 : if (found_key.offset == 0)
2684 : break;
2685 0 : key.offset = found_key.offset - 1;
2686 : }
2687 : ret = 0;
2688 0 : if (failed && !retried) {
2689 : failed = 0;
2690 : retried = true;
2691 : goto again;
2692 0 : } else if (WARN_ON(failed && retried)) {
2693 : ret = -ENOSPC;
2694 : }
2695 : error:
2696 0 : btrfs_free_path(path);
2697 : return ret;
2698 : }
2699 :
2700 22 : static int insert_balance_item(struct btrfs_root *root,
2701 : struct btrfs_balance_control *bctl)
2702 : {
2703 : struct btrfs_trans_handle *trans;
2704 : struct btrfs_balance_item *item;
2705 : struct btrfs_disk_balance_args disk_bargs;
2706 : struct btrfs_path *path;
2707 : struct extent_buffer *leaf;
2708 : struct btrfs_key key;
2709 : int ret, err;
2710 :
2711 22 : path = btrfs_alloc_path();
2712 22 : if (!path)
2713 : return -ENOMEM;
2714 :
2715 22 : trans = btrfs_start_transaction(root, 0);
2716 22 : if (IS_ERR(trans)) {
2717 0 : btrfs_free_path(path);
2718 0 : return PTR_ERR(trans);
2719 : }
2720 :
2721 22 : key.objectid = BTRFS_BALANCE_OBJECTID;
2722 22 : key.type = BTRFS_BALANCE_ITEM_KEY;
2723 22 : key.offset = 0;
2724 :
2725 : ret = btrfs_insert_empty_item(trans, root, path, &key,
2726 : sizeof(*item));
2727 22 : if (ret)
2728 : goto out;
2729 :
2730 22 : leaf = path->nodes[0];
2731 44 : item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2732 :
2733 22 : memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2734 :
2735 22 : btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2736 : btrfs_set_balance_data(leaf, item, &disk_bargs);
2737 22 : btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2738 : btrfs_set_balance_meta(leaf, item, &disk_bargs);
2739 22 : btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2740 : btrfs_set_balance_sys(leaf, item, &disk_bargs);
2741 :
2742 22 : btrfs_set_balance_flags(leaf, item, bctl->flags);
2743 :
2744 22 : btrfs_mark_buffer_dirty(leaf);
2745 : out:
2746 22 : btrfs_free_path(path);
2747 22 : err = btrfs_commit_transaction(trans, root);
2748 22 : if (err && !ret)
2749 : ret = err;
2750 22 : return ret;
2751 : }
2752 :
2753 22 : static int del_balance_item(struct btrfs_root *root)
2754 : {
2755 : struct btrfs_trans_handle *trans;
2756 : struct btrfs_path *path;
2757 : struct btrfs_key key;
2758 : int ret, err;
2759 :
2760 22 : path = btrfs_alloc_path();
2761 22 : if (!path)
2762 : return -ENOMEM;
2763 :
2764 22 : trans = btrfs_start_transaction(root, 0);
2765 22 : if (IS_ERR(trans)) {
2766 0 : btrfs_free_path(path);
2767 0 : return PTR_ERR(trans);
2768 : }
2769 :
2770 22 : key.objectid = BTRFS_BALANCE_OBJECTID;
2771 22 : key.type = BTRFS_BALANCE_ITEM_KEY;
2772 22 : key.offset = 0;
2773 :
2774 22 : ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2775 22 : if (ret < 0)
2776 : goto out;
2777 22 : if (ret > 0) {
2778 : ret = -ENOENT;
2779 : goto out;
2780 : }
2781 :
2782 : ret = btrfs_del_item(trans, root, path);
2783 : out:
2784 22 : btrfs_free_path(path);
2785 22 : err = btrfs_commit_transaction(trans, root);
2786 22 : if (err && !ret)
2787 : ret = err;
2788 22 : return ret;
2789 : }
2790 :
2791 : /*
2792 : * This is a heuristic used to reduce the number of chunks balanced on
2793 : * resume after balance was interrupted.
2794 : */
2795 0 : static void update_balance_args(struct btrfs_balance_control *bctl)
2796 : {
2797 : /*
2798 : * Turn on soft mode for chunk types that were being converted.
2799 : */
2800 0 : if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2801 0 : bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2802 0 : if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2803 0 : bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2804 0 : if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2805 0 : bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2806 :
2807 : /*
2808 :  * Turn on the usage filter if it is not already in use. The
2809 :  * idea is that chunks we have already balanced should be
2810 :  * reasonably full. Don't do it for chunks that are being
2811 :  * converted - that would keep us from relocating unconverted
2812 :  * (albeit full) chunks.
2813 : */
2814 0 : if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2815 : !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2816 0 : bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2817 0 : bctl->data.usage = 90;
2818 : }
2819 0 : if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2820 : !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2821 0 : bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2822 0 : bctl->sys.usage = 90;
2823 : }
2824 0 : if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2825 : !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2826 0 : bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2827 0 : bctl->meta.usage = 90;
2828 : }
2829 0 : }
2830 :
2831 : /*
2832 : * Should be called with both balance and volume mutexes held to
2833 : * serialize other volume operations (add_dev/rm_dev/resize) with
2834 : * restriper. Same goes for unset_balance_control.
2835 : */
2836 22 : static void set_balance_control(struct btrfs_balance_control *bctl)
2837 : {
2838 22 : struct btrfs_fs_info *fs_info = bctl->fs_info;
2839 :
2840 22 : BUG_ON(fs_info->balance_ctl);
2841 :
2842 : spin_lock(&fs_info->balance_lock);
2843 22 : fs_info->balance_ctl = bctl;
2844 : spin_unlock(&fs_info->balance_lock);
2845 22 : }
2846 :
2847 22 : static void unset_balance_control(struct btrfs_fs_info *fs_info)
2848 : {
2849 22 : struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2850 :
2851 22 : BUG_ON(!fs_info->balance_ctl);
2852 :
2853 : spin_lock(&fs_info->balance_lock);
2854 22 : fs_info->balance_ctl = NULL;
2855 : spin_unlock(&fs_info->balance_lock);
2856 :
2857 22 : kfree(bctl);
2858 22 : }
2859 :
2860 : /*
2861 : * Balance filters. Return 1 if chunk should be filtered out
2862 : * (should not be balanced).
2863 : */
2864 : static int chunk_profiles_filter(u64 chunk_type,
2865 : struct btrfs_balance_args *bargs)
2866 : {
2867 0 : chunk_type = chunk_to_extended(chunk_type) &
2868 : BTRFS_EXTENDED_PROFILE_MASK;
2869 :
2870 0 : if (bargs->profiles & chunk_type)
2871 : return 0;
2872 :
2873 : return 1;
2874 : }
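/*
 * chunk_to_extended() exists because the "single" profile is stored as
 * zero in chunk_type and so cannot be tested as a bit; the extended
 * form maps it to BTRFS_AVAIL_ALLOC_BIT_SINGLE so that bargs->profiles
 * can name "single" alongside the real profile bits. E.g. a plain
 * single data chunk (no profile bits set) matches only if the caller
 * set the SINGLE bit in bargs->profiles.
 */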
2875 :
2876 0 : static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2877 : struct btrfs_balance_args *bargs)
2878 : {
2879 : struct btrfs_block_group_cache *cache;
2880 : u64 chunk_used, user_thresh;
2881 : int ret = 1;
2882 :
2883 0 : cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2884 : chunk_used = btrfs_block_group_used(&cache->item);
2885 :
2886 0 : if (bargs->usage == 0)
2887 : user_thresh = 1;
2888 0 : else if (bargs->usage > 100)
2889 0 : user_thresh = cache->key.offset;
2890 : else
2891 0 : user_thresh = div_factor_fine(cache->key.offset,
2892 : bargs->usage);
2893 :
2894 0 : if (chunk_used < user_thresh)
2895 : ret = 0;
2896 :
2897 0 : btrfs_put_block_group(cache);
2898 0 : return ret;
2899 : }
2900 :
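/*
 * Worked example for the usage filter above, assuming div_factor_fine()
 * computes size * usage / 100: with bargs->usage == 90 and a 1 GiB
 * chunk, the threshold is ~0.9 GiB, so a chunk with less than ~900 MiB
 * used returns 0 (kept for balancing). usage == 0 keeps only entirely
 * empty chunks, and usage > 100 keeps everything not completely full.
 */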
2901 0 : static int chunk_devid_filter(struct extent_buffer *leaf,
2902 : struct btrfs_chunk *chunk,
2903 : struct btrfs_balance_args *bargs)
2904 : {
2905 : struct btrfs_stripe *stripe;
2906 0 : int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2907 : int i;
2908 :
2909 0 : for (i = 0; i < num_stripes; i++) {
2910 : stripe = btrfs_stripe_nr(chunk, i);
2911 0 : if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2912 : return 0;
2913 : }
2914 :
2915 : return 1;
2916 : }
2917 :
2918 : /* [pstart, pend) */
2919 0 : static int chunk_drange_filter(struct extent_buffer *leaf,
2920 : struct btrfs_chunk *chunk,
2921 : u64 chunk_offset,
2922 : struct btrfs_balance_args *bargs)
2923 : {
2924 : struct btrfs_stripe *stripe;
2925 0 : int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2926 : u64 stripe_offset;
2927 : u64 stripe_length;
2928 : int factor;
2929 : int i;
2930 :
2931 0 : if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2932 : return 0;
2933 :
2934 0 : if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2935 : BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
2936 0 : factor = num_stripes / 2;
2937 0 : } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
2938 0 : factor = num_stripes - 1;
2939 0 : } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
2940 0 : factor = num_stripes - 2;
2941 : } else {
2942 : factor = num_stripes;
2943 : }
2944 :
2945 0 : for (i = 0; i < num_stripes; i++) {
2946 : stripe = btrfs_stripe_nr(chunk, i);
2947 0 : if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2948 0 : continue;
2949 :
2950 : stripe_offset = btrfs_stripe_offset(leaf, stripe);
2951 : stripe_length = btrfs_chunk_length(leaf, chunk);
2952 0 : do_div(stripe_length, factor);
2953 :
2954 0 : if (stripe_offset < bargs->pend &&
2955 0 : stripe_offset + stripe_length > bargs->pstart)
2956 : return 0;
2957 : }
2958 :
2959 : return 1;
2960 : }
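/*
 * Worked example for the factor above: it converts the chunk's logical
 * length into the physical length each stripe occupies. For RAID10
 * with 4 stripes, factor == 2, so a 2 GiB chunk puts 1 GiB on each
 * device; for RAID5 with 4 stripes, factor == 3 (one stripe's worth of
 * parity); DUP/RAID1 halve the stripe count, and RAID0/single divide
 * by the full stripe count, i.e. no redundancy overhead.
 */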
2961 :
2962 : /* [vstart, vend) */
2963 0 : static int chunk_vrange_filter(struct extent_buffer *leaf,
2964 : struct btrfs_chunk *chunk,
2965 : u64 chunk_offset,
2966 : struct btrfs_balance_args *bargs)
2967 : {
2968 0 : if (chunk_offset < bargs->vend &&
2969 0 : chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2970 : /* at least part of the chunk is inside this vrange */
2971 : return 0;
2972 :
2973 : return 1;
2974 : }
2975 :
2976 : static int chunk_soft_convert_filter(u64 chunk_type,
2977 : struct btrfs_balance_args *bargs)
2978 : {
2979 0 : if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2980 : return 0;
2981 :
2982 0 : chunk_type = chunk_to_extended(chunk_type) &
2983 : BTRFS_EXTENDED_PROFILE_MASK;
2984 :
2985 0 : if (bargs->target == chunk_type)
2986 : return 1;
2987 :
2988 : return 0;
2989 : }
2990 :
2991 144 : static int should_balance_chunk(struct btrfs_root *root,
2992 : struct extent_buffer *leaf,
2993 : struct btrfs_chunk *chunk, u64 chunk_offset)
2994 : {
2995 144 : struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2996 : struct btrfs_balance_args *bargs = NULL;
2997 : u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2998 :
2999 : /* type filter */
3000 144 : if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3001 144 : (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3002 : return 0;
3003 : }
3004 :
3005 144 : if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3006 44 : bargs = &bctl->data;
3007 100 : else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3008 50 : bargs = &bctl->sys;
3009 50 : else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3010 50 : bargs = &bctl->meta;
3011 :
3012 : /* profiles filter */
3013 144 : if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3014 : chunk_profiles_filter(chunk_type, bargs)) {
3015 : return 0;
3016 : }
3017 :
3018 : /* usage filter */
3019 144 : if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3020 0 : chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
3021 : return 0;
3022 : }
3023 :
3024 : /* devid filter */
3025 144 : if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3026 0 : chunk_devid_filter(leaf, chunk, bargs)) {
3027 : return 0;
3028 : }
3029 :
3030 : /* drange filter, makes sense only with devid filter */
3031 144 : if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3032 0 : chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
3033 : return 0;
3034 : }
3035 :
3036 : /* vrange filter */
3037 144 : if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3038 0 : chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3039 : return 0;
3040 : }
3041 :
3042 : /* soft profile changing mode */
3043 144 : if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3044 : chunk_soft_convert_filter(chunk_type, bargs)) {
3045 : return 0;
3046 : }
3047 :
3048 : /*
3049 : * limited by count, must be the last filter
3050 : */
3051 144 : if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3052 0 : if (bargs->limit == 0)
3053 : return 0;
3054 : else
3055 0 : bargs->limit--;
3056 : }
3057 :
3058 : return 1;
3059 : }
3060 :
3061 22 : static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3062 : {
3063 22 : struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3064 238 : struct btrfs_root *chunk_root = fs_info->chunk_root;
3065 22 : struct btrfs_root *dev_root = fs_info->dev_root;
3066 : struct list_head *devices;
3067 : struct btrfs_device *device;
3068 : u64 old_size;
3069 : u64 size_to_free;
3070 : struct btrfs_chunk *chunk;
3071 : struct btrfs_path *path;
3072 : struct btrfs_key key;
3073 : struct btrfs_key found_key;
3074 : struct btrfs_trans_handle *trans;
3075 : struct extent_buffer *leaf;
3076 : int slot;
3077 : int ret;
3078 : int enospc_errors = 0;
3079 : bool counting = true;
3080 22 : u64 limit_data = bctl->data.limit;
3081 22 : u64 limit_meta = bctl->meta.limit;
3082 22 : u64 limit_sys = bctl->sys.limit;
3083 :
3084 : /* step one make some room on all the devices */
3085 :         /* step one, make some room on all the devices */
3086 44 : list_for_each_entry(device, devices, dev_list) {
3087 22 : old_size = device->total_bytes;
3088 : size_to_free = div_factor(old_size, 1);
3089 22 : size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
3090 44 : if (!device->writeable ||
3091 22 : device->total_bytes - device->bytes_used > size_to_free ||
3092 0 : device->is_tgtdev_for_dev_replace)
3093 22 : continue;
3094 :
3095 0 : ret = btrfs_shrink_device(device, old_size - size_to_free);
3096 0 : if (ret == -ENOSPC)
3097 : break;
3098 0 : BUG_ON(ret);
3099 :
3100 0 : trans = btrfs_start_transaction(dev_root, 0);
3101 0 : BUG_ON(IS_ERR(trans));
3102 :
3103 0 : ret = btrfs_grow_device(trans, device, old_size);
3104 0 : BUG_ON(ret);
3105 :
3106 0 : btrfs_end_transaction(trans, dev_root);
3107 : }
3108 :
3109 : /* step two, relocate all the chunks */
3110 22 : path = btrfs_alloc_path();
3111 22 : if (!path) {
3112 : ret = -ENOMEM;
3113 : goto error;
3114 : }
3115 :
3116 : /* zero out stat counters */
3117 : spin_lock(&fs_info->balance_lock);
3118 22 : memset(&bctl->stat, 0, sizeof(bctl->stat));
3119 : spin_unlock(&fs_info->balance_lock);
3120 : again:
3121 44 : if (!counting) {
3122 22 : bctl->data.limit = limit_data;
3123 22 : bctl->meta.limit = limit_meta;
3124 22 : bctl->sys.limit = limit_sys;
3125 : }
3126 44 : key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3127 44 : key.offset = (u64)-1;
3128 44 : key.type = BTRFS_CHUNK_ITEM_KEY;
3129 :
3130 : while (1) {
3131 455 : if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3132 : atomic_read(&fs_info->balance_cancel_req)) {
3133 : ret = -ECANCELED;
3134 : goto error;
3135 : }
3136 :
3137 182 : ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3138 182 : if (ret < 0)
3139 : goto error;
3140 :
3141 : /*
3142 : * this shouldn't happen, it means the last relocate
3143 : * failed
3144 : */
3145 182 : if (ret == 0)
3146 0 : BUG(); /* FIXME break ? */
3147 :
3148 182 : ret = btrfs_previous_item(chunk_root, path, 0,
3149 : BTRFS_CHUNK_ITEM_KEY);
3150 182 : if (ret) {
3151 : ret = 0;
3152 : break;
3153 : }
3154 :
3155 144 : leaf = path->nodes[0];
3156 144 : slot = path->slots[0];
3157 144 : btrfs_item_key_to_cpu(leaf, &found_key, slot);
3158 :
3159 144 : if (found_key.objectid != key.objectid)
3160 : break;
3161 :
3162 144 : chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3163 :
3164 144 : if (!counting) {
3165 : spin_lock(&fs_info->balance_lock);
3166 72 : bctl->stat.considered++;
3167 : spin_unlock(&fs_info->balance_lock);
3168 : }
3169 :
3170 288 : ret = should_balance_chunk(chunk_root, leaf, chunk,
3171 : found_key.offset);
3172 144 : btrfs_release_path(path);
3173 144 : if (!ret)
3174 : goto loop;
3175 :
3176 144 : if (counting) {
3177 : spin_lock(&fs_info->balance_lock);
3178 72 : bctl->stat.expected++;
3179 : spin_unlock(&fs_info->balance_lock);
3180 : goto loop;
3181 : }
3182 :
3183 144 : ret = btrfs_relocate_chunk(chunk_root,
3184 : chunk_root->root_key.objectid,
3185 : found_key.objectid,
3186 : found_key.offset);
3187 72 : if (ret && ret != -ENOSPC)
3188 : goto error;
3189 72 : if (ret == -ENOSPC) {
3190 0 : enospc_errors++;
3191 : } else {
3192 : spin_lock(&fs_info->balance_lock);
3193 72 : bctl->stat.completed++;
3194 : spin_unlock(&fs_info->balance_lock);
3195 : }
3196 : loop:
3197 144 : if (found_key.offset == 0)
3198 : break;
3199 138 : key.offset = found_key.offset - 1;
3200 138 : }
3201 :
3202 44 : if (counting) {
3203 22 : btrfs_release_path(path);
3204 : counting = false;
3205 22 : goto again;
3206 : }
3207 : error:
3208 22 : btrfs_free_path(path);
3209 22 : if (enospc_errors) {
3210 0 : btrfs_info(fs_info, "%d enospc errors during balance",
3211 : enospc_errors);
3212 0 : if (!ret)
3213 : ret = -ENOSPC;
3214 : }
3215 :
3216 22 : return ret;
3217 : }
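/*
 * __btrfs_balance() above makes two passes over the chunk tree with a
 * single loop body, switched by the counting flag; a sketch of the
 * control flow with the details elided:
 *
 *	counting = true;
 * again:
 *	for each chunk, highest key first:
 *		if (!should_balance_chunk(...))
 *			continue;
 *		if (counting)
 *			bctl->stat.expected++;		(pass 1: count only)
 *		else
 *			btrfs_relocate_chunk(...);	(pass 2: move it)
 *	if (counting) { counting = false; goto again; }
 *
 * The first pass fills in the "expected" statistic so that progress can
 * be reported before any chunk actually moves; the per-type limit
 * counters are restored between the passes for the same reason.
 */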
3218 :
3219 : /**
3220 : * alloc_profile_is_valid - see if a given profile is valid and reduced
3221 : * @flags: profile to validate
3222 : * @extended: if true @flags is treated as an extended profile
3223 : */
3224 : static int alloc_profile_is_valid(u64 flags, int extended)
3225 : {
3226 : u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3227 : BTRFS_BLOCK_GROUP_PROFILE_MASK);
3228 :
3229 87 : flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3230 :
3231 : /* 1) check that all other bits are zeroed */
3232 87 : if (flags & ~mask)
3233 : return 0;
3234 :
3235 : /* 2) see if profile is reduced */
3236 87 : if (flags == 0)
3237 : return !extended; /* "0" is valid for usual profiles */
3238 :
3239 : /* true if exactly one bit set */
3240 46 : return (flags & (flags - 1)) == 0;
3241 : }
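/*
 * The final check above is the standard power-of-two test: a value
 * with exactly one bit set becomes zero when ANDed with itself minus
 * one. E.g. flags == 0x10: 0x10 & 0x0f == 0 (valid, one profile);
 * flags == 0x30: 0x30 & 0x2f == 0x20 != 0 (two profile bits set, not
 * a reduced profile, so rejected).
 */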
3242 :
3243 : static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3244 : {
3245 : /* cancel requested || normal exit path */
3246 44 : return atomic_read(&fs_info->balance_cancel_req) ||
3247 22 : (atomic_read(&fs_info->balance_pause_req) == 0 &&
3248 : atomic_read(&fs_info->balance_cancel_req) == 0);
3249 : }
3250 :
3251 22 : static void __cancel_balance(struct btrfs_fs_info *fs_info)
3252 : {
3253 : int ret;
3254 :
3255 22 : unset_balance_control(fs_info);
3256 22 : ret = del_balance_item(fs_info->tree_root);
3257 22 : if (ret)
3258 0 : btrfs_std_error(fs_info, ret);
3259 :
3260 : atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3261 22 : }
3262 :
3263 : /*
3264 : * Should be called with both balance and volume mutexes held
3265 : */
3266 22 : int btrfs_balance(struct btrfs_balance_control *bctl,
3267 : struct btrfs_ioctl_balance_args *bargs)
3268 : {
3269 22 : struct btrfs_fs_info *fs_info = bctl->fs_info;
3270 : u64 allowed;
3271 : int mixed = 0;
3272 : int ret;
3273 : u64 num_devices;
3274 : unsigned seq;
3275 :
3276 44 : if (btrfs_fs_closing(fs_info) ||
3277 22 : atomic_read(&fs_info->balance_pause_req) ||
3278 : atomic_read(&fs_info->balance_cancel_req)) {
3279 : ret = -EINVAL;
3280 : goto out;
3281 : }
3282 :
3283 22 : allowed = btrfs_super_incompat_flags(fs_info->super_copy);
3284 22 : if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
3285 : mixed = 1;
3286 :
3287 : /*
3288 : * In case of mixed groups both data and meta should be picked,
3289 : * and identical options should be given for both of them.
3290 : */
3291 : allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
3292 22 : if (mixed && (bctl->flags & allowed)) {
3293 0 : if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3294 0 : !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3295 0 : memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
3296 0 : btrfs_err(fs_info, "with mixed groups data and "
3297 : "metadata balance options must be the same");
3298 : ret = -EINVAL;
3299 0 : goto out;
3300 : }
3301 : }
3302 :
3303 22 : num_devices = fs_info->fs_devices->num_devices;
3304 22 : btrfs_dev_replace_lock(&fs_info->dev_replace);
3305 22 : if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3306 0 : BUG_ON(num_devices < 1);
3307 0 : num_devices--;
3308 : }
3309 22 : btrfs_dev_replace_unlock(&fs_info->dev_replace);
3310 : allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3311 22 : if (num_devices == 1)
3312 : allowed |= BTRFS_BLOCK_GROUP_DUP;
3313 0 : else if (num_devices > 1)
3314 : allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3315 22 : if (num_devices > 2)
3316 0 : allowed |= BTRFS_BLOCK_GROUP_RAID5;
3317 22 : if (num_devices > 3)
3318 0 : allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3319 : BTRFS_BLOCK_GROUP_RAID6);
3320 22 : if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3321 0 : (!alloc_profile_is_valid(bctl->data.target, 1) ||
3322 0 : (bctl->data.target & ~allowed))) {
3323 0 : btrfs_err(fs_info, "unable to start balance with target "
3324 : "data profile %llu",
3325 : bctl->data.target);
3326 : ret = -EINVAL;
3327 0 : goto out;
3328 : }
3329 22 : if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3330 0 : (!alloc_profile_is_valid(bctl->meta.target, 1) ||
3331 0 : (bctl->meta.target & ~allowed))) {
3332 0 : btrfs_err(fs_info,
3333 : "unable to start balance with target metadata profile %llu",
3334 : bctl->meta.target);
3335 : ret = -EINVAL;
3336 0 : goto out;
3337 : }
3338 22 : if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3339 0 : (!alloc_profile_is_valid(bctl->sys.target, 1) ||
3340 0 : (bctl->sys.target & ~allowed))) {
3341 0 : btrfs_err(fs_info,
3342 : "unable to start balance with target system profile %llu",
3343 : bctl->sys.target);
3344 : ret = -EINVAL;
3345 0 : goto out;
3346 : }
3347 :
3348 : /* allow dup'ed data chunks only in mixed mode */
3349 22 : if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3350 0 : (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
3351 0 : btrfs_err(fs_info, "dup for data is not allowed");
3352 : ret = -EINVAL;
3353 0 : goto out;
3354 : }
3355 :
3356 : /* allow to reduce meta or sys integrity only if force set */
3357 : allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3358 : BTRFS_BLOCK_GROUP_RAID10 |
3359 : BTRFS_BLOCK_GROUP_RAID5 |
3360 : BTRFS_BLOCK_GROUP_RAID6;
3361 : do {
3362 : seq = read_seqbegin(&fs_info->profiles_lock);
3363 :
3364 22 : if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3365 0 : (fs_info->avail_system_alloc_bits & allowed) &&
3366 22 : !(bctl->sys.target & allowed)) ||
3367 22 : ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3368 0 : (fs_info->avail_metadata_alloc_bits & allowed) &&
3369 0 : !(bctl->meta.target & allowed))) {
3370 0 : if (bctl->flags & BTRFS_BALANCE_FORCE) {
3371 0 : btrfs_info(fs_info, "force reducing metadata integrity");
3372 : } else {
3373 0 : btrfs_err(fs_info, "balance will reduce metadata "
3374 : "integrity, use force if you want this");
3375 : ret = -EINVAL;
3376 0 : goto out;
3377 : }
3378 : }
3379 22 : } while (read_seqretry(&fs_info->profiles_lock, seq));
3380 :
3381 22 : if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3382 : int num_tolerated_disk_barrier_failures;
3383 0 : u64 target = bctl->sys.target;
3384 :
3385 0 : num_tolerated_disk_barrier_failures =
3386 : btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3387 0 : if (num_tolerated_disk_barrier_failures > 0 &&
3388 0 : (target &
3389 : (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3390 : BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
3391 : num_tolerated_disk_barrier_failures = 0;
3392 0 : else if (num_tolerated_disk_barrier_failures > 1 &&
3393 0 : (target &
3394 : (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
3395 : num_tolerated_disk_barrier_failures = 1;
3396 :
3397 0 : fs_info->num_tolerated_disk_barrier_failures =
3398 : num_tolerated_disk_barrier_failures;
3399 : }
3400 :
3401 22 : ret = insert_balance_item(fs_info->tree_root, bctl);
3402 22 : if (ret && ret != -EEXIST)
3403 : goto out;
3404 :
3405 22 : if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
3406 22 : BUG_ON(ret == -EEXIST);
3407 22 : set_balance_control(bctl);
3408 : } else {
3409 0 : BUG_ON(ret != -EEXIST);
3410 : spin_lock(&fs_info->balance_lock);
3411 0 : update_balance_args(bctl);
3412 : spin_unlock(&fs_info->balance_lock);
3413 : }
3414 :
3415 22 : atomic_inc(&fs_info->balance_running);
3416 22 : mutex_unlock(&fs_info->balance_mutex);
3417 :
3418 22 : ret = __btrfs_balance(fs_info);
3419 :
3420 22 : mutex_lock(&fs_info->balance_mutex);
3421 : atomic_dec(&fs_info->balance_running);
3422 :
3423 22 : if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3424 0 : fs_info->num_tolerated_disk_barrier_failures =
3425 0 : btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3426 : }
3427 :
3428 22 : if (bargs) {
3429 22 : memset(bargs, 0, sizeof(*bargs));
3430 22 : update_ioctl_balance_args(fs_info, 0, bargs);
3431 : }
3432 :
3433 44 : if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3434 : balance_need_close(fs_info)) {
3435 22 : __cancel_balance(fs_info);
3436 : }
3437 :
3438 22 : wake_up(&fs_info->balance_wait_q);
3439 :
3440 22 : return ret;
3441 : out:
3442 0 : if (bctl->flags & BTRFS_BALANCE_RESUME)
3443 0 : __cancel_balance(fs_info);
3444 : else {
3445 0 : kfree(bctl);
3446 : atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3447 : }
3448 0 : return ret;
3449 : }
3450 :
3451 0 : static int balance_kthread(void *data)
3452 : {
3453 : struct btrfs_fs_info *fs_info = data;
3454 : int ret = 0;
3455 :
3456 0 : mutex_lock(&fs_info->volume_mutex);
3457 0 : mutex_lock(&fs_info->balance_mutex);
3458 :
3459 0 : if (fs_info->balance_ctl) {
3460 0 : btrfs_info(fs_info, "continuing balance");
3461 0 : ret = btrfs_balance(fs_info->balance_ctl, NULL);
3462 : }
3463 :
3464 0 : mutex_unlock(&fs_info->balance_mutex);
3465 0 : mutex_unlock(&fs_info->volume_mutex);
3466 :
3467 0 : return ret;
3468 : }
3469 :
3470 194 : int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3471 : {
3472 : struct task_struct *tsk;
3473 :
3474 : spin_lock(&fs_info->balance_lock);
3475 194 : if (!fs_info->balance_ctl) {
3476 : spin_unlock(&fs_info->balance_lock);
3477 194 : return 0;
3478 : }
3479 : spin_unlock(&fs_info->balance_lock);
3480 :
3481 0 : if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
3482 0 : btrfs_info(fs_info, "force skipping balance");
3483 0 : return 0;
3484 : }
3485 :
3486 0 : tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3487 0 : return PTR_ERR_OR_ZERO(tsk);
3488 : }
3489 :
3490 221 : int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3491 : {
3492 : struct btrfs_balance_control *bctl;
3493 : struct btrfs_balance_item *item;
3494 : struct btrfs_disk_balance_args disk_bargs;
3495 : struct btrfs_path *path;
3496 : struct extent_buffer *leaf;
3497 : struct btrfs_key key;
3498 : int ret;
3499 :
3500 221 : path = btrfs_alloc_path();
3501 221 : if (!path)
3502 : return -ENOMEM;
3503 :
3504 221 : key.objectid = BTRFS_BALANCE_OBJECTID;
3505 221 : key.type = BTRFS_BALANCE_ITEM_KEY;
3506 221 : key.offset = 0;
3507 :
3508 221 : ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
3509 221 : if (ret < 0)
3510 : goto out;
3511 221 : if (ret > 0) { /* ret = -ENOENT; */
3512 : ret = 0;
3513 : goto out;
3514 : }
3515 :
3516 0 : bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3517 0 : if (!bctl) {
3518 : ret = -ENOMEM;
3519 : goto out;
3520 : }
3521 :
3522 0 : leaf = path->nodes[0];
3523 0 : item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3524 :
3525 0 : bctl->fs_info = fs_info;
3526 0 : bctl->flags = btrfs_balance_flags(leaf, item);
3527 0 : bctl->flags |= BTRFS_BALANCE_RESUME;
3528 :
3529 : btrfs_balance_data(leaf, item, &disk_bargs);
3530 0 : btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
3531 : btrfs_balance_meta(leaf, item, &disk_bargs);
3532 0 : btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
3533 : btrfs_balance_sys(leaf, item, &disk_bargs);
3534 0 : btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
3535 :
3536 0 : WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
3537 :
3538 0 : mutex_lock(&fs_info->volume_mutex);
3539 0 : mutex_lock(&fs_info->balance_mutex);
3540 :
3541 0 : set_balance_control(bctl);
3542 :
3543 0 : mutex_unlock(&fs_info->balance_mutex);
3544 0 : mutex_unlock(&fs_info->volume_mutex);
3545 : out:
3546 221 : btrfs_free_path(path);
3547 221 : return ret;
3548 : }
3549 :
3550 222 : int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
3551 : {
3552 : int ret = 0;
3553 :
3554 222 : mutex_lock(&fs_info->balance_mutex);
3555 222 : if (!fs_info->balance_ctl) {
3556 222 : mutex_unlock(&fs_info->balance_mutex);
3557 222 : return -ENOTCONN;
3558 : }
3559 :
3560 0 : if (atomic_read(&fs_info->balance_running)) {
3561 0 : atomic_inc(&fs_info->balance_pause_req);
3562 0 : mutex_unlock(&fs_info->balance_mutex);
3563 :
3564 0 : wait_event(fs_info->balance_wait_q,
3565 : atomic_read(&fs_info->balance_running) == 0);
3566 :
3567 0 : mutex_lock(&fs_info->balance_mutex);
3568 : /* we are good with balance_ctl ripped off from under us */
3569 0 : BUG_ON(atomic_read(&fs_info->balance_running));
3570 : atomic_dec(&fs_info->balance_pause_req);
3571 : } else {
3572 : ret = -ENOTCONN;
3573 : }
3574 :
3575 0 : mutex_unlock(&fs_info->balance_mutex);
3576 0 : return ret;
3577 : }
3578 :
3579 0 : int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
3580 : {
3581 0 : if (fs_info->sb->s_flags & MS_RDONLY)
3582 : return -EROFS;
3583 :
3584 0 : mutex_lock(&fs_info->balance_mutex);
3585 0 : if (!fs_info->balance_ctl) {
3586 0 : mutex_unlock(&fs_info->balance_mutex);
3587 0 : return -ENOTCONN;
3588 : }
3589 :
3590 0 : atomic_inc(&fs_info->balance_cancel_req);
3591 : /*
3592 : * if balance is currently running, just wait and return; the
3593 : * balance item is deleted in btrfs_balance() in that case
3594 : */
3595 0 : if (atomic_read(&fs_info->balance_running)) {
3596 0 : mutex_unlock(&fs_info->balance_mutex);
3597 0 : wait_event(fs_info->balance_wait_q,
3598 : atomic_read(&fs_info->balance_running) == 0);
3599 0 : mutex_lock(&fs_info->balance_mutex);
3600 : } else {
3601 : /* __cancel_balance needs volume_mutex */
3602 0 : mutex_unlock(&fs_info->balance_mutex);
3603 0 : mutex_lock(&fs_info->volume_mutex);
3604 0 : mutex_lock(&fs_info->balance_mutex);
3605 :
3606 0 : if (fs_info->balance_ctl)
3607 0 : __cancel_balance(fs_info);
3608 :
3609 0 : mutex_unlock(&fs_info->volume_mutex);
3610 : }
3611 :
3612 0 : BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
3613 : atomic_dec(&fs_info->balance_cancel_req);
3614 0 : mutex_unlock(&fs_info->balance_mutex);
3615 0 : return 0;
3616 : }
3617 :
3618 99 : static int btrfs_uuid_scan_kthread(void *data)
3619 : {
3620 : struct btrfs_fs_info *fs_info = data;
3621 99 : struct btrfs_root *root = fs_info->tree_root;
3622 : struct btrfs_key key;
3623 : struct btrfs_key max_key;
3624 : struct btrfs_path *path = NULL;
3625 : int ret = 0;
3626 : struct extent_buffer *eb;
3627 : int slot;
3628 : struct btrfs_root_item root_item;
3629 : u32 item_size;
3630 : struct btrfs_trans_handle *trans = NULL;
3631 :
3632 99 : path = btrfs_alloc_path();
3633 99 : if (!path) {
3634 : ret = -ENOMEM;
3635 : goto out;
3636 : }
3637 :
3638 99 : key.objectid = 0;
3639 99 : key.type = BTRFS_ROOT_ITEM_KEY;
3640 99 : key.offset = 0;
3641 :
3642 : max_key.objectid = (u64)-1;
3643 : max_key.type = BTRFS_ROOT_ITEM_KEY;
3644 : max_key.offset = (u64)-1;
3645 :
3646 99 : path->keep_locks = 1;
3647 :
3648 : while (1) {
3649 1433 : ret = btrfs_search_forward(root, &key, path, 0);
3650 1433 : if (ret) {
3651 99 : if (ret > 0)
3652 : ret = 0;
3653 : break;
3654 : }
3655 :
3656 1929 : if (key.type != BTRFS_ROOT_ITEM_KEY ||
3657 595 : (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
3658 199 : key.objectid != BTRFS_FS_TREE_OBJECTID) ||
3659 : key.objectid > BTRFS_LAST_FREE_OBJECTID)
3660 : goto skip;
3661 :
3662 100 : eb = path->nodes[0];
3663 100 : slot = path->slots[0];
3664 : item_size = btrfs_item_size_nr(eb, slot);
3665 100 : if (item_size < sizeof(root_item))
3666 : goto skip;
3667 :
3668 100 : read_extent_buffer(eb, &root_item,
3669 : btrfs_item_ptr_offset(eb, slot),
3670 : (int)sizeof(root_item));
3671 100 : if (btrfs_root_refs(&root_item) == 0)
3672 : goto skip;
3673 :
3674 200 : if (!btrfs_is_empty_uuid(root_item.uuid) ||
3675 100 : !btrfs_is_empty_uuid(root_item.received_uuid)) {
3676 0 : if (trans)
3677 : goto update_tree;
3678 :
3679 0 : btrfs_release_path(path);
3680 : /*
3681 : * 1 - subvol uuid item
3682 : * 1 - received_subvol uuid item
3683 : */
3684 0 : trans = btrfs_start_transaction(fs_info->uuid_root, 2);
3685 0 : if (IS_ERR(trans)) {
3686 0 : ret = PTR_ERR(trans);
3687 0 : break;
3688 : }
3689 0 : continue;
3690 : } else {
3691 : goto skip;
3692 : }
3693 : update_tree:
3694 0 : if (!btrfs_is_empty_uuid(root_item.uuid)) {
3695 0 : ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
3696 : root_item.uuid,
3697 : BTRFS_UUID_KEY_SUBVOL,
3698 : key.objectid);
3699 0 : if (ret < 0) {
3700 0 : btrfs_warn(fs_info, "uuid_tree_add failed %d",
3701 : ret);
3702 0 : break;
3703 : }
3704 : }
3705 :
3706 0 : if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
3707 0 : ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
3708 : root_item.received_uuid,
3709 : BTRFS_UUID_KEY_RECEIVED_SUBVOL,
3710 : key.objectid);
3711 0 : if (ret < 0) {
3712 0 : btrfs_warn(fs_info, "uuid_tree_add failed %d",
3713 : ret);
3714 0 : break;
3715 : }
3716 : }
3717 :
3718 : skip:
3719 1334 : if (trans) {
3720 0 : ret = btrfs_end_transaction(trans, fs_info->uuid_root);
3721 : trans = NULL;
3722 0 : if (ret)
3723 : break;
3724 : }
3725 :
3726 1334 : btrfs_release_path(path);
3727 1334 : if (key.offset < (u64)-1) {
3728 1334 : key.offset++;
3729 0 : } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
3730 0 : key.offset = 0;
3731 0 : key.type = BTRFS_ROOT_ITEM_KEY;
3732 0 : } else if (key.objectid < (u64)-1) {
3733 0 : key.offset = 0;
3734 0 : key.type = BTRFS_ROOT_ITEM_KEY;
3735 0 : key.objectid++;
3736 : } else {
3737 : break;
3738 : }
3739 1334 : cond_resched();
3740 : }
3741 :
3742 : out:
3743 99 : btrfs_free_path(path);
3744 99 : if (trans && !IS_ERR(trans))
3745 0 : btrfs_end_transaction(trans, fs_info->uuid_root);
3746 99 : if (ret)
3747 0 : btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
3748 : else
3749 99 : fs_info->update_uuid_tree_gen = 1;
3750 99 : up(&fs_info->uuid_tree_rescan_sem);
3751 99 : return 0;
3752 : }
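 :
 : /*
 : * The scan above walks all ROOT_ITEMs with a (objectid, type, offset)
 : * cursor: the offset is bumped while it can still grow, then the type
 : * is normalized back up to BTRFS_ROOT_ITEM_KEY, then the objectid is
 : * advanced with type and offset reset; the loop ends once the
 : * objectid saturates at (u64)-1 as well.
 : */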
3753 :
3754 : /*
3755 : * Callback for btrfs_uuid_tree_iterate().
3756 : * returns:
3757 : * 0 check succeeded, the entry is not outdated.
3758 : * < 0 if an error occurred.
3759 : * > 0 if the check failed, which means the caller shall remove the entry.
3760 : */
3761 0 : static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
3762 : u8 *uuid, u8 type, u64 subid)
3763 : {
3764 : struct btrfs_key key;
3765 : int ret = 0;
3766 : struct btrfs_root *subvol_root;
3767 :
3768 0 : if (type != BTRFS_UUID_KEY_SUBVOL &&
3769 : type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
3770 : goto out;
3771 :
3772 0 : key.objectid = subid;
3773 0 : key.type = BTRFS_ROOT_ITEM_KEY;
3774 0 : key.offset = (u64)-1;
3775 : subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
3776 0 : if (IS_ERR(subvol_root)) {
3777 0 : ret = PTR_ERR(subvol_root);
3778 0 : if (ret == -ENOENT)
3779 : ret = 1;
3780 : goto out;
3781 : }
3782 :
3783 0 : switch (type) {
3784 : case BTRFS_UUID_KEY_SUBVOL:
3785 0 : if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
3786 : ret = 1;
3787 : break;
3788 : case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
3789 0 : if (memcmp(uuid, subvol_root->root_item.received_uuid,
3790 : BTRFS_UUID_SIZE))
3791 : ret = 1;
3792 : break;
3793 : }
3794 :
3795 : out:
3796 0 : return ret;
3797 : }
3798 :
3799 0 : static int btrfs_uuid_rescan_kthread(void *data)
3800 : {
3801 : struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
3802 : int ret;
3803 :
3804 : /*
3805 : * 1st step is to iterate through the existing UUID tree and
3806 : * to delete all entries that contain outdated data.
3807 : * 2nd step is to add all missing entries to the UUID tree.
3808 : */
3809 0 : ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
3810 0 : if (ret < 0) {
3811 0 : btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
3812 0 : up(&fs_info->uuid_tree_rescan_sem);
3813 0 : return ret;
3814 : }
3815 0 : return btrfs_uuid_scan_kthread(data);
3816 : }
3817 :
3818 99 : int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
3819 : {
3820 : struct btrfs_trans_handle *trans;
3821 99 : struct btrfs_root *tree_root = fs_info->tree_root;
3822 : struct btrfs_root *uuid_root;
3823 : struct task_struct *task;
3824 : int ret;
3825 :
3826 : /*
3827 : * 1 - root node
3828 : * 1 - root item
3829 : */
3830 99 : trans = btrfs_start_transaction(tree_root, 2);
3831 99 : if (IS_ERR(trans))
3832 0 : return PTR_ERR(trans);
3833 :
3834 99 : uuid_root = btrfs_create_tree(trans, fs_info,
3835 : BTRFS_UUID_TREE_OBJECTID);
3836 99 : if (IS_ERR(uuid_root)) {
3837 0 : btrfs_abort_transaction(trans, tree_root,
3838 : PTR_ERR(uuid_root));
3839 0 : return PTR_ERR(uuid_root);
3840 : }
3841 :
3842 99 : fs_info->uuid_root = uuid_root;
3843 :
3844 99 : ret = btrfs_commit_transaction(trans, tree_root);
3845 99 : if (ret)
3846 : return ret;
3847 :
3848 99 : down(&fs_info->uuid_tree_rescan_sem);
3849 198 : task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
3850 99 : if (IS_ERR(task)) {
3851 : /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
3852 0 : btrfs_warn(fs_info, "failed to start uuid_scan task");
3853 0 : up(&fs_info->uuid_tree_rescan_sem);
3854 0 : return PTR_ERR(task);
3855 : }
3856 :
3857 : return 0;
3858 : }
3859 :
3860 0 : int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
3861 : {
3862 : struct task_struct *task;
3863 :
3864 0 : down(&fs_info->uuid_tree_rescan_sem);
3865 0 : task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
3866 0 : if (IS_ERR(task)) {
3867 : /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
3868 0 : btrfs_warn(fs_info, "failed to start uuid_rescan task");
3869 0 : up(&fs_info->uuid_tree_rescan_sem);
3870 0 : return PTR_ERR(task);
3871 : }
3872 :
3873 : return 0;
3874 : }
3875 :
3876 : /*
3877 : * shrinking a device means finding all of the device extents past
3878 : * the new size, and then following the back refs to the chunks.
3879 : * The chunk relocation code actually frees the device extents.
3880 : */
3881 0 : int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3882 : {
3883 : struct btrfs_trans_handle *trans;
3884 0 : struct btrfs_root *root = device->dev_root;
3885 : struct btrfs_dev_extent *dev_extent = NULL;
3886 : struct btrfs_path *path;
3887 : u64 length;
3888 : u64 chunk_tree;
3889 : u64 chunk_objectid;
3890 : u64 chunk_offset;
3891 : int ret;
3892 : int slot;
3893 : int failed = 0;
3894 : bool retried = false;
3895 : struct extent_buffer *l;
3896 : struct btrfs_key key;
3897 0 : struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3898 : u64 old_total = btrfs_super_total_bytes(super_copy);
3899 0 : u64 old_size = device->total_bytes;
3900 0 : u64 diff = device->total_bytes - new_size;
3901 :
3902 0 : if (device->is_tgtdev_for_dev_replace)
3903 : return -EINVAL;
3904 :
3905 0 : path = btrfs_alloc_path();
3906 0 : if (!path)
3907 : return -ENOMEM;
3908 :
3909 0 : path->reada = 2;
3910 :
3911 : lock_chunks(root);
3912 :
3913 0 : device->total_bytes = new_size;
3914 0 : if (device->writeable) {
3915 0 : device->fs_devices->total_rw_bytes -= diff;
3916 0 : spin_lock(&root->fs_info->free_chunk_lock);
3917 0 : root->fs_info->free_chunk_space -= diff;
3918 0 : spin_unlock(&root->fs_info->free_chunk_lock);
3919 : }
3920 : unlock_chunks(root);
3921 :
3922 : again:
3923 0 : key.objectid = device->devid;
3924 0 : key.offset = (u64)-1;
3925 0 : key.type = BTRFS_DEV_EXTENT_KEY;
3926 :
3927 : do {
3928 0 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3929 0 : if (ret < 0)
3930 : goto done;
3931 :
3932 0 : ret = btrfs_previous_item(root, path, 0, key.type);
3933 0 : if (ret < 0)
3934 : goto done;
3935 0 : if (ret) {
3936 : ret = 0;
3937 0 : btrfs_release_path(path);
3938 0 : break;
3939 : }
3940 :
3941 0 : l = path->nodes[0];
3942 0 : slot = path->slots[0];
3943 0 : btrfs_item_key_to_cpu(l, &key, path->slots[0]);
3944 :
3945 0 : if (key.objectid != device->devid) {
3946 0 : btrfs_release_path(path);
3947 0 : break;
3948 : }
3949 :
3950 0 : dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3951 : length = btrfs_dev_extent_length(l, dev_extent);
3952 :
3953 0 : if (key.offset + length <= new_size) {
3954 0 : btrfs_release_path(path);
3955 0 : break;
3956 : }
3957 :
3958 : chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3959 : chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3960 : chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3961 0 : btrfs_release_path(path);
3962 :
3963 0 : ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
3964 : chunk_offset);
3965 0 : if (ret && ret != -ENOSPC)
3966 : goto done;
3967 0 : if (ret == -ENOSPC)
3968 0 : failed++;
3969 0 : } while (key.offset-- > 0);
3970 :
3971 0 : if (failed && !retried) {
3972 : failed = 0;
3973 : retried = true;
3974 : goto again;
3975 0 : } else if (failed && retried) {
3976 : ret = -ENOSPC;
3977 : lock_chunks(root);
3978 :
3979 0 : device->total_bytes = old_size;
3980 0 : if (device->writeable)
3981 0 : device->fs_devices->total_rw_bytes += diff;
3982 0 : spin_lock(&root->fs_info->free_chunk_lock);
3983 0 : root->fs_info->free_chunk_space += diff;
3984 0 : spin_unlock(&root->fs_info->free_chunk_lock);
3985 : unlock_chunks(root);
3986 : goto done;
3987 : }
3988 :
3989 : /* Shrinking succeeded, else we would be at "done". */
3990 0 : trans = btrfs_start_transaction(root, 0);
3991 0 : if (IS_ERR(trans)) {
3992 0 : ret = PTR_ERR(trans);
3993 0 : goto done;
3994 : }
3995 :
3996 : lock_chunks(root);
3997 :
3998 0 : device->disk_total_bytes = new_size;
3999 : /* Now btrfs_update_device() will change the on-disk size. */
4000 0 : ret = btrfs_update_device(trans, device);
4001 0 : if (ret) {
4002 : unlock_chunks(root);
4003 0 : btrfs_end_transaction(trans, root);
4004 0 : goto done;
4005 : }
4006 0 : WARN_ON(diff > old_total);
4007 0 : btrfs_set_super_total_bytes(super_copy, old_total - diff);
4008 : unlock_chunks(root);
4009 0 : btrfs_end_transaction(trans, root);
4010 : done:
4011 0 : btrfs_free_path(path);
4012 0 : return ret;
4013 : }
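 :
 : /*
 : * The shrink loop above walks device extents from the highest offset
 : * downwards and relocates every chunk backed by an extent that ends
 : * past new_size. -ENOSPC failures are counted and the whole walk is
 : * retried once, since earlier relocations may have freed space; if
 : * extents still cannot be moved, the old size is restored and
 : * -ENOSPC is returned. Only on success are disk_total_bytes and the
 : * super block's total_bytes updated.
 : */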
4014 :
4015 22 : static int btrfs_add_system_chunk(struct btrfs_root *root,
4016 : struct btrfs_key *key,
4017 : struct btrfs_chunk *chunk, int item_size)
4018 : {
4019 22 : struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4020 : struct btrfs_disk_key disk_key;
4021 : u32 array_size;
4022 : u8 *ptr;
4023 :
4024 : array_size = btrfs_super_sys_array_size(super_copy);
4025 22 : if (array_size + item_size + sizeof(disk_key)
4026 : > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
4027 : return -EFBIG;
4028 :
4029 22 : ptr = super_copy->sys_chunk_array + array_size;
4030 : btrfs_cpu_key_to_disk(&disk_key, key);
4031 22 : memcpy(ptr, &disk_key, sizeof(disk_key));
4032 22 : ptr += sizeof(disk_key);
4033 22 : memcpy(ptr, chunk, item_size);
4034 22 : item_size += sizeof(disk_key);
4035 22 : btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4036 : return 0;
4037 : }
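 :
 : /*
 : * sys_chunk_array in the super block is thus a packed sequence of
 : * (disk key, chunk item) pairs appended back to back:
 : *
 : * [key 0][chunk 0 + stripes][key 1][chunk 1 + stripes]...
 : *
 : * which is why item_size is grown by sizeof(disk_key) before the
 : * array size is bumped.
 : */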
4038 :
4039 : /*
4040 : * sort the devices in descending order by max_avail, total_avail
4041 : */
4042 2 : static int btrfs_cmp_device_info(const void *a, const void *b)
4043 : {
4044 : const struct btrfs_device_info *di_a = a;
4045 : const struct btrfs_device_info *di_b = b;
4046 :
4047 2 : if (di_a->max_avail > di_b->max_avail)
4048 : return -1;
4049 2 : if (di_a->max_avail < di_b->max_avail)
4050 : return 1;
4051 2 : if (di_a->total_avail > di_b->total_avail)
4052 : return -1;
4053 2 : if (di_a->total_avail < di_b->total_avail)
4054 : return 1;
4055 0 : return 0;
4056 : }
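 :
 : /*
 : * For example, devices with (max_avail, total_avail) of (1G, 5G),
 : * (2G, 2G) and (2G, 3G) sort as (2G, 3G), (2G, 2G), (1G, 5G):
 : * the largest contiguous hole wins, with total free space as the
 : * tie-breaker.
 : */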
4057 :
4058 : static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
4059 : [BTRFS_RAID_RAID10] = {
4060 : .sub_stripes = 2,
4061 : .dev_stripes = 1,
4062 : .devs_max = 0, /* 0 == as many as possible */
4063 : .devs_min = 4,
4064 : .devs_increment = 2,
4065 : .ncopies = 2,
4066 : },
4067 : [BTRFS_RAID_RAID1] = {
4068 : .sub_stripes = 1,
4069 : .dev_stripes = 1,
4070 : .devs_max = 2,
4071 : .devs_min = 2,
4072 : .devs_increment = 2,
4073 : .ncopies = 2,
4074 : },
4075 : [BTRFS_RAID_DUP] = {
4076 : .sub_stripes = 1,
4077 : .dev_stripes = 2,
4078 : .devs_max = 1,
4079 : .devs_min = 1,
4080 : .devs_increment = 1,
4081 : .ncopies = 2,
4082 : },
4083 : [BTRFS_RAID_RAID0] = {
4084 : .sub_stripes = 1,
4085 : .dev_stripes = 1,
4086 : .devs_max = 0,
4087 : .devs_min = 2,
4088 : .devs_increment = 1,
4089 : .ncopies = 1,
4090 : },
4091 : [BTRFS_RAID_SINGLE] = {
4092 : .sub_stripes = 1,
4093 : .dev_stripes = 1,
4094 : .devs_max = 1,
4095 : .devs_min = 1,
4096 : .devs_increment = 1,
4097 : .ncopies = 1,
4098 : },
4099 : [BTRFS_RAID_RAID5] = {
4100 : .sub_stripes = 1,
4101 : .dev_stripes = 1,
4102 : .devs_max = 0,
4103 : .devs_min = 2,
4104 : .devs_increment = 1,
4105 : .ncopies = 2,
4106 : },
4107 : [BTRFS_RAID_RAID6] = {
4108 : .sub_stripes = 1,
4109 : .dev_stripes = 1,
4110 : .devs_max = 0,
4111 : .devs_min = 3,
4112 : .devs_increment = 1,
4113 : .ncopies = 3,
4114 : },
4115 : };
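 :
 : /*
 : * Worked example for the table above: a RAID10 chunk over 5 rw
 : * devices. devs_increment = 2 rounds ndevs down to 4, which still
 : * satisfies devs_min = 4 and devs_increment * sub_stripes = 4, so
 : * num_stripes = ndevs * dev_stripes = 4, of which data_stripes =
 : * num_stripes / ncopies = 2 count towards the logical chunk size.
 : */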
4116 :
4117 : static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
4118 : {
4119 : /* TODO allow them to set a preferred stripe size */
4120 : return 64 * 1024;
4121 : }
4122 :
4123 : static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4124 : {
4125 87 : if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
4126 : return;
4127 :
4128 0 : btrfs_set_fs_incompat(info, RAID56);
4129 : }
4130 :
4131 : #define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \
4132 : - sizeof(struct btrfs_item) \
4133 : - sizeof(struct btrfs_chunk)) \
4134 : / sizeof(struct btrfs_stripe) + 1)
4135 :
4136 : #define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \
4137 : - 2 * sizeof(struct btrfs_disk_key) \
4138 : - 2 * sizeof(struct btrfs_chunk)) \
4139 : / sizeof(struct btrfs_stripe) + 1)
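 :
 : /*
 : * Rough arithmetic, assuming the usual on-disk sizes (2048-byte
 : * sys_chunk_array, 17-byte btrfs_disk_key, 80-byte btrfs_chunk with
 : * one embedded 32-byte btrfs_stripe): BTRFS_MAX_DEVS_SYS_CHUNK is
 : * (2048 - 2 * 17 - 2 * 80) / 32 + 1 = 58 devices.
 : */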
4140 :
4141 87 : static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4142 : struct btrfs_root *extent_root, u64 start,
4143 : u64 type)
4144 : {
4145 87 : struct btrfs_fs_info *info = extent_root->fs_info;
4146 87 : struct btrfs_fs_devices *fs_devices = info->fs_devices;
4147 : struct list_head *cur;
4148 : struct map_lookup *map = NULL;
4149 : struct extent_map_tree *em_tree;
4150 : struct extent_map *em;
4151 : struct btrfs_device_info *devices_info = NULL;
4152 : u64 total_avail;
4153 : int num_stripes; /* total number of stripes to allocate */
4154 : int data_stripes; /* number of stripes that count for
4155 : block group size */
4156 : int sub_stripes; /* sub_stripes info for map */
4157 : int dev_stripes; /* stripes per dev */
4158 : int devs_max; /* max devs to use */
4159 : int devs_min; /* min devs needed */
4160 : int devs_increment; /* ndevs has to be a multiple of this */
4161 : int ncopies; /* how many copies of the data */
4162 : int ret;
4163 : u64 max_stripe_size;
4164 : u64 max_chunk_size;
4165 : u64 stripe_size;
4166 : u64 num_bytes;
4167 : u64 raid_stripe_len = BTRFS_STRIPE_LEN;
4168 : int ndevs;
4169 : int i;
4170 : int j;
4171 : int index;
4172 :
4173 87 : BUG_ON(!alloc_profile_is_valid(type, 0));
4174 :
4175 174 : if (list_empty(&fs_devices->alloc_list))
4176 : return -ENOSPC;
4177 :
4178 87 : index = __get_raid_index(type);
4179 :
4180 87 : sub_stripes = btrfs_raid_array[index].sub_stripes;
4181 87 : dev_stripes = btrfs_raid_array[index].dev_stripes;
4182 87 : devs_max = btrfs_raid_array[index].devs_max;
4183 87 : devs_min = btrfs_raid_array[index].devs_min;
4184 87 : devs_increment = btrfs_raid_array[index].devs_increment;
4185 87 : ncopies = btrfs_raid_array[index].ncopies;
4186 :
4187 87 : if (type & BTRFS_BLOCK_GROUP_DATA) {
4188 : max_stripe_size = 1024 * 1024 * 1024;
4189 : max_chunk_size = 10 * max_stripe_size;
4190 42 : if (!devs_max)
4191 0 : devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4192 45 : } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4193 : /* for larger filesystems, use larger metadata chunks */
4194 23 : if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
4195 : max_stripe_size = 1024 * 1024 * 1024;
4196 : else
4197 : max_stripe_size = 256 * 1024 * 1024;
4198 : max_chunk_size = max_stripe_size;
4199 23 : if (!devs_max)
4200 0 : devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4201 22 : } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4202 : max_stripe_size = 32 * 1024 * 1024;
4203 : max_chunk_size = 2 * max_stripe_size;
4204 22 : if (!devs_max)
4205 : devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
4206 : } else {
4207 0 : btrfs_err(info, "invalid chunk type 0x%llx requested",
4208 : type);
4209 0 : BUG_ON(1);
4210 : }
4211 :
4212 : /* we don't want a chunk larger than 10% of writeable space */
4213 174 : max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4214 : max_chunk_size);
4215 :
4216 87 : devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
4217 : GFP_NOFS);
4218 87 : if (!devices_info)
4219 : return -ENOMEM;
4220 :
4221 87 : cur = fs_devices->alloc_list.next;
4222 :
4223 : /*
4224 : * in the first pass through the devices list, we gather information
4225 : * about the available holes on each device.
4226 : */
4227 : ndevs = 0;
4228 263 : while (cur != &fs_devices->alloc_list) {
4229 : struct btrfs_device *device;
4230 : u64 max_avail;
4231 : u64 dev_offset;
4232 :
4233 89 : device = list_entry(cur, struct btrfs_device, dev_alloc_list);
4234 :
4235 89 : cur = cur->next;
4236 :
4237 89 : if (!device->writeable) {
4238 0 : WARN(1, KERN_ERR
4239 : "BTRFS: read-only device in alloc_list\n");
4240 0 : continue;
4241 : }
4242 :
4243 178 : if (!device->in_fs_metadata ||
4244 89 : device->is_tgtdev_for_dev_replace)
4245 0 : continue;
4246 :
4247 89 : if (device->total_bytes > device->bytes_used)
4248 89 : total_avail = device->total_bytes - device->bytes_used;
4249 : else
4250 : total_avail = 0;
4251 :
4252 : /* If there is no space on this device, skip it. */
4253 89 : if (total_avail == 0)
4254 0 : continue;
4255 :
4256 89 : ret = find_free_dev_extent(trans, device,
4257 : max_stripe_size * dev_stripes,
4258 : &dev_offset, &max_avail);
4259 89 : if (ret && ret != -ENOSPC)
4260 : goto error;
4261 :
4262 89 : if (ret == 0)
4263 86 : max_avail = max_stripe_size * dev_stripes;
4264 :
4265 89 : if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
4266 0 : continue;
4267 :
4268 89 : if (ndevs == fs_devices->rw_devices) {
4269 0 : WARN(1, "%s: found more than %llu devices\n",
4270 : __func__, fs_devices->rw_devices);
4271 0 : break;
4272 : }
4273 89 : devices_info[ndevs].dev_offset = dev_offset;
4274 89 : devices_info[ndevs].max_avail = max_avail;
4275 89 : devices_info[ndevs].total_avail = total_avail;
4276 89 : devices_info[ndevs].dev = device;
4277 89 : ++ndevs;
4278 : }
4279 :
4280 : /*
4281 : * now sort the devices by hole size / available space
4282 : */
4283 87 : sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
4284 : btrfs_cmp_device_info, NULL);
4285 :
4286 : /* round down to number of usable stripes */
4287 87 : ndevs -= ndevs % devs_increment;
4288 :
4289 87 : if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
4290 : ret = -ENOSPC;
4291 : goto error;
4292 : }
4293 :
4294 87 : if (devs_max && ndevs > devs_max)
4295 : ndevs = devs_max;
4296 : /*
4297 : * the primary goal is to maximize the number of stripes, so use as many
4298 : * devices as possible, even if the stripes are not maximum sized.
4299 : */
4300 87 : stripe_size = devices_info[ndevs-1].max_avail;
4301 87 : num_stripes = ndevs * dev_stripes;
4302 :
4303 : /*
4304 : * this will have to be fixed for RAID1 and RAID10 over
4305 : * more drives
4306 : */
4307 87 : data_stripes = num_stripes / ncopies;
4308 :
4309 87 : if (type & BTRFS_BLOCK_GROUP_RAID5) {
4310 : raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
4311 : btrfs_super_stripesize(info->super_copy));
4312 0 : data_stripes = num_stripes - 1;
4313 : }
4314 87 : if (type & BTRFS_BLOCK_GROUP_RAID6) {
4315 : raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
4316 : btrfs_super_stripesize(info->super_copy));
4317 0 : data_stripes = num_stripes - 2;
4318 : }
4319 :
4320 : /*
4321 : * Use the number of data stripes to figure out how big this chunk
4322 : * is really going to be in terms of logical address space,
4323 : * and compare that answer with the max chunk size
4324 : */
4325 87 : if (stripe_size * data_stripes > max_chunk_size) {
4326 : u64 mask = (1ULL << 24) - 1;
4327 : stripe_size = max_chunk_size;
4328 26 : do_div(stripe_size, data_stripes);
4329 :
4330 : /* bump the answer up to a 16MB boundary */
4331 26 : stripe_size = (stripe_size + mask) & ~mask;
4332 :
4333 : /* but don't go higher than the limits we found
4334 : * while searching for free extents
4335 : */
4336 26 : if (stripe_size > devices_info[ndevs-1].max_avail)
4337 : stripe_size = devices_info[ndevs-1].max_avail;
4338 : }
4339 :
4340 87 : do_div(stripe_size, dev_stripes);
4341 :
4342 : /* align to BTRFS_STRIPE_LEN */
4343 87 : do_div(stripe_size, raid_stripe_len);
4344 87 : stripe_size *= raid_stripe_len;
4345 :
4346 87 : map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4347 87 : if (!map) {
4348 : ret = -ENOMEM;
4349 : goto error;
4350 : }
4351 87 : map->num_stripes = num_stripes;
4352 :
4353 176 : for (i = 0; i < ndevs; ++i) {
4354 133 : for (j = 0; j < dev_stripes; ++j) {
4355 133 : int s = i * dev_stripes + j;
4356 133 : map->stripes[s].dev = devices_info[i].dev;
4357 266 : map->stripes[s].physical = devices_info[i].dev_offset +
4358 133 : j * stripe_size;
4359 : }
4360 : }
4361 87 : map->sector_size = extent_root->sectorsize;
4362 87 : map->stripe_len = raid_stripe_len;
4363 87 : map->io_align = raid_stripe_len;
4364 87 : map->io_width = raid_stripe_len;
4365 87 : map->type = type;
4366 87 : map->sub_stripes = sub_stripes;
4367 :
4368 87 : num_bytes = stripe_size * data_stripes;
4369 :
4370 87 : trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
4371 :
4372 87 : em = alloc_extent_map();
4373 87 : if (!em) {
4374 0 : kfree(map);
4375 : ret = -ENOMEM;
4376 0 : goto error;
4377 : }
4378 : set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
4379 87 : em->bdev = (struct block_device *)map;
4380 87 : em->start = start;
4381 87 : em->len = num_bytes;
4382 87 : em->block_start = 0;
4383 87 : em->block_len = em->len;
4384 87 : em->orig_block_len = stripe_size;
4385 :
4386 87 : em_tree = &extent_root->fs_info->mapping_tree.map_tree;
4387 87 : write_lock(&em_tree->lock);
4388 87 : ret = add_extent_mapping(em_tree, em, 0);
4389 87 : if (!ret) {
4390 87 : list_add_tail(&em->list, &trans->transaction->pending_chunks);
4391 87 : atomic_inc(&em->refs);
4392 : }
4393 : write_unlock(&em_tree->lock);
4394 87 : if (ret) {
4395 0 : free_extent_map(em);
4396 0 : goto error;
4397 : }
4398 :
4399 87 : ret = btrfs_make_block_group(trans, extent_root, 0, type,
4400 : BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4401 : start, num_bytes);
4402 87 : if (ret)
4403 : goto error_del_extent;
4404 :
4405 87 : free_extent_map(em);
4406 87 : check_raid56_incompat_flag(extent_root->fs_info, type);
4407 :
4408 87 : kfree(devices_info);
4409 87 : return 0;
4410 :
4411 : error_del_extent:
4412 0 : write_lock(&em_tree->lock);
4413 0 : remove_extent_mapping(em_tree, em);
4414 : write_unlock(&em_tree->lock);
4415 :
4416 : /* One for our allocation */
4417 0 : free_extent_map(em);
4418 : /* One for the tree reference */
4419 0 : free_extent_map(em);
4420 : error:
4421 0 : kfree(devices_info);
4422 0 : return ret;
4423 : }
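 :
 : /*
 : * A rough sizing example for the function above: a DATA chunk as
 : * RAID0 across two large devices. max_stripe_size = 1G caps
 : * max_avail at 1G per device, so stripe_size starts at 1G;
 : * num_stripes = data_stripes = 2 gives a 2G logical chunk, within
 : * the 10G DATA cap and (on a filesystem of 20G or more) the
 : * 10%-of-writeable-space clamp, so each device donates one 1G
 : * device extent.
 : */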
4424 :
4425 87 : int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4426 : struct btrfs_root *extent_root,
4427 : u64 chunk_offset, u64 chunk_size)
4428 : {
4429 : struct btrfs_key key;
4430 109 : struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
4431 : struct btrfs_device *device;
4432 : struct btrfs_chunk *chunk;
4433 : struct btrfs_stripe *stripe;
4434 : struct extent_map_tree *em_tree;
4435 : struct extent_map *em;
4436 : struct map_lookup *map;
4437 : size_t item_size;
4438 : u64 dev_offset;
4439 : u64 stripe_size;
4440 : int i = 0;
4441 : int ret;
4442 :
4443 87 : em_tree = &extent_root->fs_info->mapping_tree.map_tree;
4444 87 : read_lock(&em_tree->lock);
4445 87 : em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
4446 : read_unlock(&em_tree->lock);
4447 :
4448 87 : if (!em) {
4449 0 : btrfs_crit(extent_root->fs_info, "unable to find logical "
4450 : "%Lu len %Lu", chunk_offset, chunk_size);
4451 0 : return -EINVAL;
4452 : }
4453 :
4454 87 : if (em->start != chunk_offset || em->len != chunk_size) {
4455 0 : btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
4456 : " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
4457 : chunk_size, em->start, em->len);
4458 0 : free_extent_map(em);
4459 0 : return -EINVAL;
4460 : }
4461 :
4462 87 : map = (struct map_lookup *)em->bdev;
4463 87 : item_size = btrfs_chunk_item_size(map->num_stripes);
4464 87 : stripe_size = em->orig_block_len;
4465 :
4466 87 : chunk = kzalloc(item_size, GFP_NOFS);
4467 87 : if (!chunk) {
4468 : ret = -ENOMEM;
4469 : goto out;
4470 : }
4471 :
4472 133 : for (i = 0; i < map->num_stripes; i++) {
4473 133 : device = map->stripes[i].dev;
4474 133 : dev_offset = map->stripes[i].physical;
4475 :
4476 133 : device->bytes_used += stripe_size;
4477 133 : ret = btrfs_update_device(trans, device);
4478 133 : if (ret)
4479 : goto out;
4480 133 : ret = btrfs_alloc_dev_extent(trans, device,
4481 : chunk_root->root_key.objectid,
4482 : BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4483 : chunk_offset, dev_offset,
4484 : stripe_size);
4485 133 : if (ret)
4486 : goto out;
4487 : }
4488 :
4489 87 : spin_lock(&extent_root->fs_info->free_chunk_lock);
4490 174 : extent_root->fs_info->free_chunk_space -= (stripe_size *
4491 87 : map->num_stripes);
4492 87 : spin_unlock(&extent_root->fs_info->free_chunk_lock);
4493 :
4494 87 : stripe = &chunk->stripe;
4495 220 : for (i = 0; i < map->num_stripes; i++) {
4496 133 : device = map->stripes[i].dev;
4497 133 : dev_offset = map->stripes[i].physical;
4498 :
4499 133 : btrfs_set_stack_stripe_devid(stripe, device->devid);
4500 : btrfs_set_stack_stripe_offset(stripe, dev_offset);
4501 133 : memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4502 133 : stripe++;
4503 : }
4504 :
4505 : btrfs_set_stack_chunk_length(chunk, chunk_size);
4506 87 : btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
4507 87 : btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
4508 87 : btrfs_set_stack_chunk_type(chunk, map->type);
4509 87 : btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
4510 87 : btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
4511 87 : btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
4512 87 : btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
4513 87 : btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
4514 :
4515 87 : key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
4516 87 : key.type = BTRFS_CHUNK_ITEM_KEY;
4517 87 : key.offset = chunk_offset;
4518 :
4519 87 : ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4520 87 : if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4521 : /*
4522 : * TODO: Cleanup of inserted chunk root in case of
4523 : * failure.
4524 : */
4525 44 : ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
4526 : item_size);
4527 : }
4528 :
4529 : out:
4530 87 : kfree(chunk);
4531 87 : free_extent_map(em);
4532 87 : return ret;
4533 : }
4534 :
4535 : /*
4536 : * Chunk allocation falls into two parts. The first part does the work
4537 : * that makes the newly allocated chunk usable, but does not do any
4538 : * operation that modifies the chunk tree. The second part does the
4539 : * work that requires modifying the chunk tree. This division is
4540 : * important for the bootstrap process of adding storage to a seed btrfs.
4541 : */
4542 87 : int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4543 : struct btrfs_root *extent_root, u64 type)
4544 : {
4545 : u64 chunk_offset;
4546 :
4547 87 : chunk_offset = find_next_chunk(extent_root->fs_info);
4548 87 : return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
4549 : }
4550 :
4551 0 : static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4552 : struct btrfs_root *root,
4553 : struct btrfs_device *device)
4554 : {
4555 : u64 chunk_offset;
4556 : u64 sys_chunk_offset;
4557 : u64 alloc_profile;
4558 0 : struct btrfs_fs_info *fs_info = root->fs_info;
4559 0 : struct btrfs_root *extent_root = fs_info->extent_root;
4560 : int ret;
4561 :
4562 0 : chunk_offset = find_next_chunk(fs_info);
4563 0 : alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
4564 0 : ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
4565 : alloc_profile);
4566 0 : if (ret)
4567 : return ret;
4568 :
4569 0 : sys_chunk_offset = find_next_chunk(root->fs_info);
4570 0 : alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4571 0 : ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4572 : alloc_profile);
4573 0 : if (ret) {
4574 0 : btrfs_abort_transaction(trans, root, ret);
4575 0 : goto out;
4576 : }
4577 :
4578 0 : ret = btrfs_add_device(trans, fs_info->chunk_root, device);
4579 0 : if (ret)
4580 0 : btrfs_abort_transaction(trans, root, ret);
4581 : out:
4582 0 : return ret;
4583 : }
4584 :
4585 1141 : int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4586 : {
4587 : struct extent_map *em;
4588 : struct map_lookup *map;
4589 1141 : struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4590 : int readonly = 0;
4591 : int i;
4592 :
4593 1141 : read_lock(&map_tree->map_tree.lock);
4594 1141 : em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
4595 : read_unlock(&map_tree->map_tree.lock);
4596 1141 : if (!em)
4597 : return 1;
4598 :
4599 1141 : if (btrfs_test_opt(root, DEGRADED)) {
4600 0 : free_extent_map(em);
4601 0 : return 0;
4602 : }
4603 :
4604 1141 : map = (struct map_lookup *)em->bdev;
4605 2757 : for (i = 0; i < map->num_stripes; i++) {
4606 1616 : if (!map->stripes[i].dev->writeable) {
4607 : readonly = 1;
4608 : break;
4609 : }
4610 : }
4611 1141 : free_extent_map(em);
4612 1141 : return readonly;
4613 : }
4614 :
4615 221 : void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
4616 : {
4617 221 : extent_map_tree_init(&tree->map_tree);
4618 221 : }
4619 :
4620 221 : void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
4621 : {
4622 : struct extent_map *em;
4623 :
4624 : while (1) {
4625 1377 : write_lock(&tree->map_tree.lock);
4626 1377 : em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
4627 1377 : if (em)
4628 1156 : remove_extent_mapping(&tree->map_tree, em);
4629 : write_unlock(&tree->map_tree.lock);
4630 1377 : if (!em)
4631 : break;
4632 : /* once for us */
4633 1156 : free_extent_map(em);
4634 : /* once for the tree */
4635 1156 : free_extent_map(em);
4636 1156 : }
4637 221 : }
4638 :
4639 0 : int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4640 : {
4641 : struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4642 : struct extent_map *em;
4643 : struct map_lookup *map;
4644 0 : struct extent_map_tree *em_tree = &map_tree->map_tree;
4645 : int ret;
4646 :
4647 0 : read_lock(&em_tree->lock);
4648 0 : em = lookup_extent_mapping(em_tree, logical, len);
4649 : read_unlock(&em_tree->lock);
4650 :
4651 : /*
4652 : * We could return errors for these cases, but that could get ugly and
4653 : * we'd probably just end up doing the same thing, i.e. nothing, and
4654 : * exit, so return 1 so the callers don't try to use other copies.
4655 : */
4656 0 : if (!em) {
4657 0 : btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
4658 : logical+len);
4659 0 : return 1;
4660 : }
4661 :
4662 0 : if (em->start > logical || em->start + em->len < logical) {
4663 0 : btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
4664 : "%Lu-%Lu", logical, logical+len, em->start,
4665 : em->start + em->len);
4666 0 : free_extent_map(em);
4667 0 : return 1;
4668 : }
4669 :
4670 0 : map = (struct map_lookup *)em->bdev;
4671 0 : if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
4672 0 : ret = map->num_stripes;
4673 0 : else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4674 0 : ret = map->sub_stripes;
4675 0 : else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4676 : ret = 2;
4677 0 : else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4678 : ret = 3;
4679 : else
4680 : ret = 1;
4681 0 : free_extent_map(em);
4682 :
4683 0 : btrfs_dev_replace_lock(&fs_info->dev_replace);
4684 0 : if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4685 0 : ret++;
4686 0 : btrfs_dev_replace_unlock(&fs_info->dev_replace);
4687 :
4688 0 : return ret;
4689 : }
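 :
 : /*
 : * The copy count doubles as the range of valid mirror numbers for
 : * repair reads: RAID6 reports 3, for example, and __btrfs_map_block()
 : * below maps mirror 1 to the data block, mirror 2 to the P stripe
 : * and mirror 3 to the Q stripe.
 : */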
4690 :
4691 1228 : unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4692 : struct btrfs_mapping_tree *map_tree,
4693 : u64 logical)
4694 : {
4695 : struct extent_map *em;
4696 : struct map_lookup *map;
4697 1228 : struct extent_map_tree *em_tree = &map_tree->map_tree;
4698 1228 : unsigned long len = root->sectorsize;
4699 :
4700 1228 : read_lock(&em_tree->lock);
4701 1228 : em = lookup_extent_mapping(em_tree, logical, len);
4702 : read_unlock(&em_tree->lock);
4703 1228 : BUG_ON(!em);
4704 :
4705 1228 : BUG_ON(em->start > logical || em->start + em->len < logical);
4706 1228 : map = (struct map_lookup *)em->bdev;
4707 1228 : if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4708 : BTRFS_BLOCK_GROUP_RAID6)) {
4709 12 : len = map->stripe_len * nr_data_stripes(map);
4710 : }
4711 1228 : free_extent_map(em);
4712 1228 : return len;
4713 : }
4714 :
4715 0 : int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4716 : u64 logical, u64 len, int mirror_num)
4717 : {
4718 : struct extent_map *em;
4719 : struct map_lookup *map;
4720 0 : struct extent_map_tree *em_tree = &map_tree->map_tree;
4721 : int ret = 0;
4722 :
4723 0 : read_lock(&em_tree->lock);
4724 0 : em = lookup_extent_mapping(em_tree, logical, len);
4725 : read_unlock(&em_tree->lock);
4726 0 : BUG_ON(!em);
4727 :
4728 0 : BUG_ON(em->start > logical || em->start + em->len < logical);
4729 0 : map = (struct map_lookup *)em->bdev;
4730 0 : if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4731 : BTRFS_BLOCK_GROUP_RAID6))
4732 : ret = 1;
4733 0 : free_extent_map(em);
4734 0 : return ret;
4735 : }
4736 :
4737 70116 : static int find_live_mirror(struct btrfs_fs_info *fs_info,
4738 : struct map_lookup *map, int first, int num,
4739 : int optimal, int dev_replace_is_ongoing)
4740 : {
4741 : int i;
4742 : int tolerance;
4743 : struct btrfs_device *srcdev;
4744 :
4745 140126 : if (dev_replace_is_ongoing &&
4746 70010 : fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4747 : BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4748 0 : srcdev = fs_info->dev_replace.srcdev;
4749 : else
4750 : srcdev = NULL;
4751 :
4752 : /*
4753 : * try to avoid the drive that is the source drive for a
4754 : * dev-replace procedure, only choose it if no other non-missing
4755 : * mirror is available
4756 : */
4757 0 : for (tolerance = 0; tolerance < 2; tolerance++) {
4758 70116 : if (map->stripes[optimal].dev->bdev &&
4759 70116 : (tolerance || map->stripes[optimal].dev != srcdev))
4760 : return optimal;
4761 0 : for (i = first; i < first + num; i++) {
4762 0 : if (map->stripes[i].dev->bdev &&
4763 0 : (tolerance || map->stripes[i].dev != srcdev))
4764 : return i;
4765 : }
4766 : }
4767 :
4768 : /* we couldn't find one that doesn't fail. Just return something
4769 : * and the io error handling code will clean up eventually
4770 : */
4771 : return optimal;
4772 : }
4773 :
4774 : static inline int parity_smaller(u64 a, u64 b)
4775 : {
4776 : return a > b;
4777 : }
4778 :
4779 : /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4780 38 : static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4781 : {
4782 : struct btrfs_bio_stripe s;
4783 : int i;
4784 : u64 l;
4785 : int again = 1;
4786 :
4787 165 : while (again) {
4788 : again = 0;
4789 267 : for (i = 0; i < bbio->num_stripes - 1; i++) {
4790 267 : if (parity_smaller(raid_map[i], raid_map[i+1])) {
4791 89 : s = bbio->stripes[i];
4792 : l = raid_map[i];
4793 89 : bbio->stripes[i] = bbio->stripes[i+1];
4794 89 : raid_map[i] = raid_map[i+1];
4795 89 : bbio->stripes[i+1] = s;
4796 89 : raid_map[i+1] = l;
4797 : again = 1;
4798 : }
4799 : }
4800 : }
4801 38 : }
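 :
 : /*
 : * This works because the parity sentinels RAID5_P_STRIPE ((u64)-2)
 : * and RAID6_Q_STRIPE ((u64)-1) are the two largest u64 values and
 : * compare greater than any real logical address, so the ascending
 : * sort above naturally leaves P, then Q, at the tail of the set.
 : */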
4802 :
4803 1770950 : static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4804 : u64 logical, u64 *length,
4805 : struct btrfs_bio **bbio_ret,
4806 : int mirror_num, u64 **raid_map_ret)
4807 : {
4808 : struct extent_map *em;
4809 : struct map_lookup *map;
4810 : struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4811 1770950 : struct extent_map_tree *em_tree = &map_tree->map_tree;
4812 : u64 offset;
4813 : u64 stripe_offset;
4814 : u64 stripe_end_offset;
4815 : u64 stripe_nr;
4816 : u64 stripe_nr_orig;
4817 : u64 stripe_nr_end;
4818 : u64 stripe_len;
4819 : u64 *raid_map = NULL;
4820 : int stripe_index;
4821 : int i;
4822 : int ret = 0;
4823 : int num_stripes;
4824 : int max_errors = 0;
4825 : struct btrfs_bio *bbio = NULL;
4826 1770950 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4827 : int dev_replace_is_ongoing = 0;
4828 : int num_alloc_stripes;
4829 : int patch_the_first_stripe_for_dev_replace = 0;
4830 : u64 physical_to_patch_in_first_stripe = 0;
4831 : u64 raid56_full_stripe_start = (u64)-1;
4832 :
4833 1770950 : read_lock(&em_tree->lock);
4834 1771455 : em = lookup_extent_mapping(em_tree, logical, *length);
4835 : read_unlock(&em_tree->lock);
4836 :
4837 1772091 : if (!em) {
4838 0 : btrfs_crit(fs_info, "unable to find logical %llu len %llu",
4839 : logical, *length);
4840 0 : return -EINVAL;
4841 : }
4842 :
4843 1772091 : if (em->start > logical || em->start + em->len < logical) {
4844 0 : btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
4845 : "found %Lu-%Lu", logical, em->start,
4846 : em->start + em->len);
4847 0 : free_extent_map(em);
4848 0 : return -EINVAL;
4849 : }
4850 :
4851 1772139 : map = (struct map_lookup *)em->bdev;
4852 1772139 : offset = logical - em->start;
4853 :
4854 1772139 : stripe_len = map->stripe_len;
4855 : stripe_nr = offset;
4856 : /*
4857 : * stripe_nr counts the total number of stripes we have to stride
4858 : * to get to this block
4859 : */
4860 1772139 : do_div(stripe_nr, stripe_len);
4861 :
4862 1772139 : stripe_offset = stripe_nr * stripe_len;
4863 1772139 : BUG_ON(offset < stripe_offset);
4864 :
4865 : /* stripe_offset is the offset of this block in its stripe */
4866 1772139 : stripe_offset = offset - stripe_offset;
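 :
 : /*
 : * e.g. with the default 64K stripe_len and offset 200K into the
 : * chunk: do_div() leaves stripe_nr = 3, so stripe_offset becomes
 : * 200K - 3 * 64K = 8K into the fourth stripe.
 : */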
4867 :
4868 : /* if we're here for raid56, we need to know the stripe aligned start */
4869 1772139 : if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4870 1418 : unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4871 : raid56_full_stripe_start = offset;
4872 :
4873 : /* allow a write of a full stripe, but make sure we don't
4874 : * allow straddling of stripes
4875 : */
4876 1418 : do_div(raid56_full_stripe_start, full_stripe_len);
4877 1418 : raid56_full_stripe_start *= full_stripe_len;
4878 : }
4879 :
4880 1772139 : if (rw & REQ_DISCARD) {
4881 : /* we don't discard raid56 yet */
4882 0 : if (map->type &
4883 : (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4884 : ret = -EOPNOTSUPP;
4885 : goto out;
4886 : }
4887 0 : *length = min_t(u64, em->len - offset, *length);
4888 1772139 : } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
4889 : u64 max_len;
4890 : /* For writes to RAID[56], allow a full stripeset across all disks.
4891 : * For other RAID types and for RAID[56] reads, just allow a single
4892 : * stripe (on a single disk). */
4893 972711 : if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
4894 1418 : (rw & REQ_WRITE)) {
4895 2678 : max_len = stripe_len * nr_data_stripes(map) -
4896 1339 : (offset - raid56_full_stripe_start);
4897 : } else {
4898 : /* we limit the length of each bio to what fits in a stripe */
4899 969954 : max_len = stripe_len - stripe_offset;
4900 : }
4901 971293 : *length = min_t(u64, em->len - offset, max_len);
4902 : } else {
4903 800846 : *length = em->len - offset;
4904 : }
4905 : /* This is for when we're called from btrfs_merge_bio_hook() and all
4906 : * it cares about is the length */
4907 : it cares about is the length */
4908 1772139 : if (!bbio_ret)
4909 : goto out;
4910 :
4911 251815 : btrfs_dev_replace_lock(dev_replace);
4912 251823 : dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
4913 251823 : if (!dev_replace_is_ongoing)
4914 157988 : btrfs_dev_replace_unlock(dev_replace);
4915 :
4916 251821 : if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
4917 0 : !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
4918 0 : dev_replace->tgtdev != NULL) {
4919 : /*
4920 : * in dev-replace case, for repair case (that's the only
4921 : * case where the mirror is selected explicitly when
4922 : * calling btrfs_map_block), blocks left of the left cursor
4923 : * can also be read from the target drive.
4924 : * For REQ_GET_READ_MIRRORS, the target drive is added as
4925 : * the last one to the array of stripes. For READ, it also
4926 : * needs to be supported using the same mirror number.
4927 : * If the requested block is not left of the left cursor,
4928 : * EIO is returned. This can happen because btrfs_num_copies()
4929 : * returns one more in the dev-replace case.
4930 : */
4931 0 : u64 tmp_length = *length;
4932 0 : struct btrfs_bio *tmp_bbio = NULL;
4933 : int tmp_num_stripes;
4934 0 : u64 srcdev_devid = dev_replace->srcdev->devid;
4935 : int index_srcdev = 0;
4936 : int found = 0;
4937 : u64 physical_of_found = 0;
4938 :
4939 0 : ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4940 : logical, &tmp_length, &tmp_bbio, 0, NULL);
4941 0 : if (ret) {
4942 0 : WARN_ON(tmp_bbio != NULL);
4943 0 : goto out;
4944 : }
4945 :
4946 0 : tmp_num_stripes = tmp_bbio->num_stripes;
4947 0 : if (mirror_num > tmp_num_stripes) {
4948 : /*
4949 : * REQ_GET_READ_MIRRORS does not contain this
4950 : * mirror, which means that the requested area
4951 : * is not left of the left cursor
4952 : */
4953 : ret = -EIO;
4954 0 : kfree(tmp_bbio);
4955 0 : goto out;
4956 : }
4957 :
4958 : /*
4959 : * process the rest of the function using the mirror_num
4960 : * of the source drive. Therefore look it up first.
4961 : * At the end, patch the device pointer to that of the
4962 : * target drive.
4963 : */
4964 0 : for (i = 0; i < tmp_num_stripes; i++) {
4965 0 : if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
4966 : /*
4967 : * In case of DUP, in order to keep it
4968 : * simple, only add the mirror with the
4969 : * lowest physical address
4970 : */
4971 0 : if (found &&
4972 : physical_of_found <=
4973 0 : tmp_bbio->stripes[i].physical)
4974 0 : continue;
4975 : index_srcdev = i;
4976 : found = 1;
4977 0 : physical_of_found =
4978 : tmp_bbio->stripes[i].physical;
4979 : }
4980 : }
4981 :
4982 0 : if (found) {
4983 0 : mirror_num = index_srcdev + 1;
4984 : patch_the_first_stripe_for_dev_replace = 1;
4985 : physical_to_patch_in_first_stripe = physical_of_found;
4986 : } else {
4987 0 : WARN_ON(1);
4988 : ret = -EIO;
4989 0 : kfree(tmp_bbio);
4990 0 : goto out;
4991 : }
4992 :
4993 0 : kfree(tmp_bbio);
4994 251821 : } else if (mirror_num > map->num_stripes) {
4995 : mirror_num = 0;
4996 : }
4997 :
4998 : num_stripes = 1;
4999 : stripe_index = 0;
5000 : stripe_nr_orig = stripe_nr;
5001 251821 : stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
5002 251821 : do_div(stripe_nr_end, map->stripe_len);
5003 251821 : stripe_end_offset = stripe_nr_end * map->stripe_len -
5004 : (offset + *length);
5005 :
5006 251821 : if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5007 3359 : if (rw & REQ_DISCARD)
5008 0 : num_stripes = min_t(u64, map->num_stripes,
5009 : stripe_nr_end - stripe_nr_orig);
5010 3359 : stripe_index = do_div(stripe_nr, map->num_stripes);
5011 248462 : } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
5012 111765 : if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
5013 41448 : num_stripes = map->num_stripes;
5014 70317 : else if (mirror_num)
5015 208 : stripe_index = mirror_num - 1;
5016 : else {
5017 70109 : stripe_index = find_live_mirror(fs_info, map, 0,
5018 : map->num_stripes,
5019 70109 : current->pid % map->num_stripes,
5020 : dev_replace_is_ongoing);
5021 70109 : mirror_num = stripe_index + 1;
5022 : }
5023 :
5024 136697 : } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
5025 29813 : if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
5026 21348 : num_stripes = map->num_stripes;
5027 8465 : } else if (mirror_num) {
5028 0 : stripe_index = mirror_num - 1;
5029 : } else {
5030 : mirror_num = 1;
5031 : }
5032 :
5033 106884 : } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5034 43 : int factor = map->num_stripes / map->sub_stripes;
5035 :
5036 43 : stripe_index = do_div(stripe_nr, factor);
5037 43 : stripe_index *= map->sub_stripes;
5038 :
5039 43 : if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5040 : num_stripes = map->sub_stripes;
5041 7 : else if (rw & REQ_DISCARD)
5042 0 : num_stripes = min_t(u64, map->sub_stripes *
5043 : (stripe_nr_end - stripe_nr_orig),
5044 : map->num_stripes);
5045 7 : else if (mirror_num)
5046 0 : stripe_index += mirror_num - 1;
5047 : else {
5048 : int old_stripe_index = stripe_index;
5049 7 : stripe_index = find_live_mirror(fs_info, map,
5050 : stripe_index,
5051 : map->sub_stripes, stripe_index +
5052 7 : current->pid % map->sub_stripes,
5053 : dev_replace_is_ongoing);
5054 7 : mirror_num = stripe_index - old_stripe_index + 1;
5055 : }
5056 :
5057 106841 : } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
5058 : BTRFS_BLOCK_GROUP_RAID6)) {
5059 : u64 tmp;
5060 :
5061 52 : if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
5062 76 : && raid_map_ret) {
5063 : int i, rot;
5064 :
5065 : /* push stripe_nr back to the start of the full stripe */
5066 : stripe_nr = raid56_full_stripe_start;
5067 38 : do_div(stripe_nr, stripe_len);
5068 :
5069 38 : stripe_index = do_div(stripe_nr, nr_data_stripes(map));
5070 :
5071 : /* RAID[56] write or recovery. Return all stripes */
5072 : num_stripes = map->num_stripes;
5073 : max_errors = nr_parity_stripes(map);
5074 :
5075 38 : raid_map = kmalloc_array(num_stripes, sizeof(u64),
5076 : GFP_NOFS);
5077 0 : if (!raid_map) {
5078 : ret = -ENOMEM;
5079 : goto out;
5080 : }
5081 :
5082 : /* Work out the disk rotation on this stripe-set */
5083 : tmp = stripe_nr;
5084 38 : rot = do_div(tmp, num_stripes);
5085 :
5086 : /* Fill in the logical address of each stripe */
5087 38 : tmp = stripe_nr * nr_data_stripes(map);
5088 266 : for (i = 0; i < nr_data_stripes(map); i++)
5089 190 : raid_map[(i+rot) % num_stripes] =
5090 95 : em->start + (tmp + i) * map->stripe_len;
5091 :
5092 38 : raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
5093 38 : if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5094 19 : raid_map[(i+rot+1) % num_stripes] =
5095 : RAID6_Q_STRIPE;
5096 :
5097 38 : *length = map->stripe_len;
5098 : stripe_index = 0;
5099 : stripe_offset = 0;
5100 : } else {
5101 : /*
5102 : * Mirror #0 or #1 means the original data block.
5103 : * Mirror #2 is RAID5 parity block.
5104 : * Mirror #3 is RAID6 Q block.
5105 : */
5106 14 : stripe_index = do_div(stripe_nr, nr_data_stripes(map));
5107 14 : if (mirror_num > 1)
5108 0 : stripe_index = nr_data_stripes(map) +
5109 : mirror_num - 2;
5110 :
5111 : /* We distribute the parity blocks across stripes */
5112 14 : tmp = stripe_nr + stripe_index;
5113 14 : stripe_index = do_div(tmp, map->num_stripes);
5114 : }
5115 : } else {
5116 : /*
5117 : * after this do_div call, stripe_nr is the number of stripes
5118 : * on this device we have to walk to find the data, and
5119 : * stripe_index is the number of our device in the stripe array
5120 : */
5121 106789 : stripe_index = do_div(stripe_nr, map->num_stripes);
5122 106789 : mirror_num = stripe_index + 1;
5123 : }
5124 251821 : BUG_ON(stripe_index >= map->num_stripes);
5125 :
5126 : num_alloc_stripes = num_stripes;
5127 251821 : if (dev_replace_is_ongoing) {
5128 93835 : if (rw & (REQ_WRITE | REQ_DISCARD))
5129 4161 : num_alloc_stripes <<= 1;
5130 93835 : if (rw & REQ_GET_READ_MIRRORS)
5131 760 : num_alloc_stripes++;
5132 : }
5133 251821 : bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
5134 251818 : if (!bbio) {
5135 0 : kfree(raid_map);
5136 : ret = -ENOMEM;
5137 0 : goto out;
5138 : }
5139 : atomic_set(&bbio->error, 0);
5140 :
5141 251818 : if (rw & REQ_DISCARD) {
5142 : int factor = 0;
5143 : int sub_stripes = 0;
5144 : u64 stripes_per_dev = 0;
5145 : u32 remaining_stripes = 0;
5146 : u32 last_stripe = 0;
5147 :
5148 0 : if (map->type &
5149 : (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
5150 0 : if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5151 : sub_stripes = 1;
5152 : else
5153 0 : sub_stripes = map->sub_stripes;
5154 :
5155 0 : factor = map->num_stripes / sub_stripes;
5156 0 : stripes_per_dev = div_u64_rem(stripe_nr_end -
5157 : stripe_nr_orig,
5158 : factor,
5159 : &remaining_stripes);
5160 0 : div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5161 0 : last_stripe *= sub_stripes;
5162 : }
5163 :
5164 0 : for (i = 0; i < num_stripes; i++) {
5165 0 : bbio->stripes[i].physical =
5166 0 : map->stripes[stripe_index].physical +
5167 0 : stripe_offset + stripe_nr * map->stripe_len;
5168 0 : bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5169 :
5170 0 : if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5171 : BTRFS_BLOCK_GROUP_RAID10)) {
5172 0 : bbio->stripes[i].length = stripes_per_dev *
5173 0 : map->stripe_len;
5174 :
5175 0 : if (i / sub_stripes < remaining_stripes)
5176 0 : bbio->stripes[i].length +=
5177 0 : map->stripe_len;
5178 :
5179 : /*
5180 : * Special for the first stripe and
5181 : * the last stripe:
5182 : *
5183 : * |-------|...|-------|
5184 : * |----------|
5185 : * off end_off
5186 : */
5187 0 : if (i < sub_stripes)
5188 0 : bbio->stripes[i].length -=
5189 : stripe_offset;
5190 :
5191 0 : if (stripe_index >= last_stripe &&
5192 0 : stripe_index <= (last_stripe +
5193 0 : sub_stripes - 1))
5194 0 : bbio->stripes[i].length -=
5195 : stripe_end_offset;
5196 :
5197 0 : if (i == sub_stripes - 1)
5198 : stripe_offset = 0;
5199 : } else
5200 0 : bbio->stripes[i].length = *length;
5201 :
5202 0 : stripe_index++;
5203 0 : if (stripe_index == map->num_stripes) {
5204 : /* This could only happen for RAID0/10 */
5205 : stripe_index = 0;
5206 0 : stripe_nr++;
5207 : }
5208 : }
5209 : } else {
5210 314763 : for (i = 0; i < num_stripes; i++) {
5211 314763 : bbio->stripes[i].physical =
5212 314763 : map->stripes[stripe_index].physical +
5213 314763 : stripe_offset +
5214 314763 : stripe_nr * map->stripe_len;
5215 314763 : bbio->stripes[i].dev =
5216 314763 : map->stripes[stripe_index].dev;
5217 314763 : stripe_index++;
5218 : }
5219 : }
5220 :
5221 251818 : if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
5222 121001 : if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5223 : BTRFS_BLOCK_GROUP_RAID10 |
5224 : BTRFS_BLOCK_GROUP_RAID5 |
5225 : BTRFS_BLOCK_GROUP_DUP)) {
5226 : max_errors = 1;
5227 58152 : } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5228 : max_errors = 2;
5229 : }
5230 : }
5231 :
5232 255979 : if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5233 4161 : dev_replace->tgtdev != NULL) {
5234 : int index_where_to_add;
5235 4161 : u64 srcdev_devid = dev_replace->srcdev->devid;
5236 :
5237 : /*
5238 : * duplicate the write operations while the dev replace
5239 : * procedure is running. Since the copying of the old disk
5240 : * to the new disk takes place at run time while the
5241 : * filesystem is mounted writable, the regular write
5242 : * operations to the old disk have to be duplicated to go
5243 : * to the new disk as well.
5244 : * Note that device->missing is handled by the caller, and
5245 : * that the write to the old disk is already set up in the
5246 : * stripes array.
5247 : */
5248 : index_where_to_add = num_stripes;
5249 12316 : for (i = 0; i < num_stripes; i++) {
5250 8155 : if (bbio->stripes[i].dev->devid == srcdev_devid) {
5251 : /* write to new disk, too */
5252 4234 : struct btrfs_bio_stripe *new =
5253 4234 : bbio->stripes + index_where_to_add;
5254 4234 : struct btrfs_bio_stripe *old =
5255 4234 : bbio->stripes + i;
5256 :
5257 4234 : new->physical = old->physical;
5258 4234 : new->length = old->length;
5259 4234 : new->dev = dev_replace->tgtdev;
5260 4234 : index_where_to_add++;
5261 4234 : max_errors++;
5262 : }
5263 : }
5264 : num_stripes = index_where_to_add;
5265 248417 : } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
5266 760 : dev_replace->tgtdev != NULL) {
5267 760 : u64 srcdev_devid = dev_replace->srcdev->devid;
5268 : int index_srcdev = 0;
5269 : int found = 0;
5270 : u64 physical_of_found = 0;
5271 :
5272 : /*
5273 : * During the dev-replace procedure, the target drive can
5274 : * also be used to read data in case it is needed to repair
5275 : * a corrupt block elsewhere. This is possible if the
5276 : * requested area is left of the left cursor. In this area,
5277 : * the target drive is a full copy of the source drive.
5278 : */
5279 2158 : for (i = 0; i < num_stripes; i++) {
5280 1398 : if (bbio->stripes[i].dev->devid == srcdev_devid) {
5281 : /*
5282 : * In case of DUP, in order to keep it
5283 : * simple, only add the mirror with the
5284 : * lowest physical address
5285 : */
5286 1329 : if (found &&
5287 : physical_of_found <=
5288 289 : bbio->stripes[i].physical)
5289 289 : continue;
5290 : index_srcdev = i;
5291 : found = 1;
5292 751 : physical_of_found = bbio->stripes[i].physical;
5293 : }
5294 : }
5295 760 : if (found) {
5296 751 : u64 length = map->stripe_len;
5297 :
5298 1502 : if (physical_of_found + length <=
5299 751 : dev_replace->cursor_left) {
5300 508 : struct btrfs_bio_stripe *tgtdev_stripe =
5301 508 : bbio->stripes + num_stripes;
5302 :
5303 508 : tgtdev_stripe->physical = physical_of_found;
5304 508 : tgtdev_stripe->length =
5305 508 : bbio->stripes[index_srcdev].length;
5306 508 : tgtdev_stripe->dev = dev_replace->tgtdev;
5307 :
5308 508 : num_stripes++;
5309 : }
5310 : }
5311 : }
5312 :
5313 251818 : *bbio_ret = bbio;
5314 251818 : bbio->num_stripes = num_stripes;
5315 251818 : bbio->max_errors = max_errors;
5316 251818 : bbio->mirror_num = mirror_num;
5317 :
5318 : /*
5319 : * This is the case where REQ_READ && dev_replace_is_ongoing &&
5320 : * mirror_num == num_stripes + 1, i.e. the dev-replace target
5321 : * drive is available as a mirror.
5322 : */
5323 251818 : if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
5324 0 : WARN_ON(num_stripes > 1);
5325 0 : bbio->stripes[0].dev = dev_replace->tgtdev;
5326 0 : bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
5327 0 : bbio->mirror_num = map->num_stripes + 1;
5328 : }
5329 251818 : if (raid_map) {
5330 38 : sort_parity_stripes(bbio, raid_map);
5331 38 : *raid_map_ret = raid_map;
5332 : }
5333 : out:
5334 1770717 : if (dev_replace_is_ongoing)
5335 93835 : btrfs_dev_replace_unlock(dev_replace);
5336 1770717 : free_extent_map(em);
5337 1772287 : return ret;
5338 : }
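
/*
 * Hedged example, not part of the kernel source: a minimal userspace
 * sketch of the RAID0-style decomposition that __btrfs_map_block() does
 * above (stripe_nr / stripe_index / stripe_offset from a logical offset).
 * struct simple_map and all names below are illustrative stand-ins for
 * the kernel's map_lookup fields.
 */
#include <stdint.h>
#include <stdio.h>

struct simple_map {
	uint64_t stripe_len;	/* bytes per stripe, e.g. 64K */
	int num_stripes;	/* number of devices in the chunk */
};

static void map_logical(const struct simple_map *map, uint64_t offset,
			uint64_t *stripe_nr, int *stripe_index,
			uint64_t *stripe_offset)
{
	uint64_t nr = offset / map->stripe_len;

	*stripe_offset = offset - nr * map->stripe_len; /* offset inside the stripe */
	*stripe_index = (int)(nr % map->num_stripes);	/* which device */
	*stripe_nr = nr / map->num_stripes;		/* stripe row on that device */
}

int main(void)
{
	struct simple_map map = { .stripe_len = 65536, .num_stripes = 3 };
	uint64_t nr, off;
	int idx;

	map_logical(&map, 200000, &nr, &idx, &off);
	/* physical = stripes[idx].physical + off + nr * stripe_len */
	printf("stripe_nr=%llu index=%d offset=%llu\n",
	       (unsigned long long)nr, idx, (unsigned long long)off);
	return 0;
}
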
5339 :
5340 1624878 : int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5341 : u64 logical, u64 *length,
5342 : struct btrfs_bio **bbio_ret, int mirror_num)
5343 : {
5344 1624878 : return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5345 : mirror_num, NULL);
5346 : }
5347 :
5348 3684 : int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5349 : u64 chunk_start, u64 physical, u64 devid,
5350 : u64 **logical, int *naddrs, int *stripe_len)
5351 : {
5352 3684 : struct extent_map_tree *em_tree = &map_tree->map_tree;
5353 : struct extent_map *em;
5354 : struct map_lookup *map;
5355 : u64 *buf;
5356 : u64 bytenr;
5357 : u64 length;
5358 : u64 stripe_nr;
5359 : u64 rmap_len;
5360 : int i, j, nr = 0;
5361 :
5362 3684 : read_lock(&em_tree->lock);
5363 3684 : em = lookup_extent_mapping(em_tree, chunk_start, 1);
5364 : read_unlock(&em_tree->lock);
5365 :
5366 3684 : if (!em) {
5367 0 : printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n",
5368 : chunk_start);
5369 0 : return -EIO;
5370 : }
5371 :
5372 3684 : if (em->start != chunk_start) {
5373 0 : printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n",
5374 : em->start, chunk_start);
5375 0 : free_extent_map(em);
5376 0 : return -EIO;
5377 : }
5378 3684 : map = (struct map_lookup *)em->bdev;
5379 :
5380 3684 : length = em->len;
5381 3684 : rmap_len = map->stripe_len;
5382 :
5383 3684 : if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5384 9 : do_div(length, map->num_stripes / map->sub_stripes);
5385 3675 : else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5386 33 : do_div(length, map->num_stripes);
5387 3642 : else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
5388 : BTRFS_BLOCK_GROUP_RAID6)) {
5389 18 : do_div(length, nr_data_stripes(map));
5390 18 : rmap_len = map->stripe_len * nr_data_stripes(map);
5391 : }
5392 :
5393 3684 : buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
5394 3684 : BUG_ON(!buf); /* -ENOMEM */
5395 :
5396 5247 : for (i = 0; i < map->num_stripes; i++) {
5397 5247 : if (devid && map->stripes[i].dev->devid != devid)
5398 0 : continue;
5399 8555 : if (map->stripes[i].physical > physical ||
5400 3308 : map->stripes[i].physical + length <= physical)
5401 4769 : continue;
5402 :
5403 478 : stripe_nr = physical - map->stripes[i].physical;
5404 478 : do_div(stripe_nr, map->stripe_len);
5405 :
5406 478 : if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5407 4 : stripe_nr = stripe_nr * map->num_stripes + i;
5408 4 : do_div(stripe_nr, map->sub_stripes);
5409 474 : } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5410 12 : stripe_nr = stripe_nr * map->num_stripes + i;
5411 : } /* else if RAID[56], multiply by nr_data_stripes().
5412 : * Alternatively, just use rmap_len below instead of
5413 : * map->stripe_len */
5414 :
5415 478 : bytenr = chunk_start + stripe_nr * rmap_len;
5416 478 : WARN_ON(nr >= map->num_stripes);
5417 500 : for (j = 0; j < nr; j++) {
5418 28 : if (buf[j] == bytenr)
5419 : break;
5420 : }
5421 478 : if (j == nr) {
5422 472 : WARN_ON(nr >= map->num_stripes);
5423 472 : buf[nr++] = bytenr;
5424 : }
5425 : }
5426 :
5427 3684 : *logical = buf;
5428 3684 : *naddrs = nr;
5429 3684 : *stripe_len = rmap_len;
5430 :
5431 3684 : free_extent_map(em);
5432 3684 : return 0;
5433 : }
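
/*
 * Hedged example, not kernel code: the core arithmetic of the reverse
 * mapping that btrfs_rmap_block() performs above, reduced to the plain
 * RAID0 case. Parameter names mirror the kernel's variables; rmap_raid0
 * itself is an illustrative helper.
 */
#include <stdint.h>

static uint64_t rmap_raid0(uint64_t chunk_start, uint64_t stripe_physical,
			   uint64_t physical, uint64_t stripe_len,
			   int num_stripes, int stripe_index)
{
	/* which stripe row of this device holds the physical byte */
	uint64_t stripe_nr = (physical - stripe_physical) / stripe_len;

	/* interleave the per-device row back into chunk-wide stripe order */
	stripe_nr = stripe_nr * num_stripes + stripe_index;

	return chunk_start + stripe_nr * stripe_len;
}
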
5434 :
5435 147560 : static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
5436 : {
5437 147560 : if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
5438 147560 : bio_endio_nodec(bio, err);
5439 : else
5440 0 : bio_endio(bio, err);
5441 147557 : kfree(bbio);
5442 147556 : }
5443 :
5444 212900 : static void btrfs_end_bio(struct bio *bio, int err)
5445 : {
5446 212900 : struct btrfs_bio *bbio = bio->bi_private;
5447 : struct btrfs_device *dev = bbio->stripes[0].dev;
5448 : int is_orig_bio = 0;
5449 :
5450 212900 : if (err) {
5451 0 : atomic_inc(&bbio->error);
5452 0 : if (err == -EIO || err == -EREMOTEIO) {
5453 0 : unsigned int stripe_index =
5454 0 : btrfs_io_bio(bio)->stripe_index;
5455 :
5456 0 : BUG_ON(stripe_index >= bbio->num_stripes);
5457 0 : dev = bbio->stripes[stripe_index].dev;
5458 0 : if (dev->bdev) {
5459 0 : if (bio->bi_rw & WRITE)
5460 : btrfs_dev_stat_inc(dev,
5461 : BTRFS_DEV_STAT_WRITE_ERRS);
5462 : else
5463 : btrfs_dev_stat_inc(dev,
5464 : BTRFS_DEV_STAT_READ_ERRS);
5465 0 : if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
5466 : btrfs_dev_stat_inc(dev,
5467 : BTRFS_DEV_STAT_FLUSH_ERRS);
5468 0 : btrfs_dev_stat_print_on_error(dev);
5469 : }
5470 : }
5471 : }
5472 :
5473 212911 : if (bio == bbio->orig_bio)
5474 : is_orig_bio = 1;
5475 :
5476 212911 : btrfs_bio_counter_dec(bbio->fs_info);
5477 :
5478 425803 : if (atomic_dec_and_test(&bbio->stripes_pending)) {
5479 147558 : if (!is_orig_bio) {
5480 12216 : bio_put(bio);
5481 12216 : bio = bbio->orig_bio;
5482 : }
5483 :
5484 147558 : bio->bi_private = bbio->private;
5485 147558 : bio->bi_end_io = bbio->end_io;
5486 147558 : btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5487 : /* Only send an error to the higher layers if it is
5488 : * beyond the tolerance of the btrfs bio.
5489 : */
5490 147558 : if (atomic_read(&bbio->error) > bbio->max_errors) {
5491 : err = -EIO;
5492 : } else {
5493 : /*
5494 : * this bio is actually up to date, we didn't
5495 : * go over the max number of errors
5496 : */
5497 : set_bit(BIO_UPTODATE, &bio->bi_flags);
5498 : err = 0;
5499 : }
5500 :
5501 147566 : btrfs_end_bbio(bbio, bio, err);
5502 65356 : } else if (!is_orig_bio) {
5503 53140 : bio_put(bio);
5504 : }
5505 212911 : }
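
/*
 * Hedged illustration, not kernel code: the completion pattern used by
 * btrfs_end_bio() above, where each stripe bio decrements a shared
 * counter and only the final completer ends the original bio. C11
 * atomics stand in for the kernel's atomic_t; struct fanout is made up.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct fanout {
	atomic_int pending;	/* one count per submitted stripe bio */
	atomic_int errors;	/* how many stripe bios failed */
	int max_errors;		/* failures tolerated by the RAID profile */
};

/* Returns true exactly once: for the completer that drops pending to 0. */
static bool fanout_complete_one(struct fanout *f, bool failed, int *final_err)
{
	if (failed)
		atomic_fetch_add(&f->errors, 1);
	if (atomic_fetch_sub(&f->pending, 1) != 1)
		return false;	/* other stripe bios still in flight */
	/* -1 stands in for -EIO in this sketch */
	*final_err = atomic_load(&f->errors) > f->max_errors ? -1 : 0;
	return true;
}
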
5506 :
5507 : /*
5508 : * see run_scheduled_bios for a description of why bios are collected for
5509 : * async submit.
5510 : *
5511 : * This will add one bio to the pending list for a device and make sure
5512 : * the work struct is scheduled.
5513 : */
5514 105965 : static noinline void btrfs_schedule_bio(struct btrfs_root *root,
5515 : struct btrfs_device *device,
5516 : int rw, struct bio *bio)
5517 : {
5518 : int should_queue = 1;
5519 : struct btrfs_pending_bios *pending_bios;
5520 :
5521 105965 : if (device->missing || !device->bdev) {
5522 0 : bio_endio(bio, -EIO);
5523 : return;
5524 : }
5525 :
5526 : /* don't bother with additional async steps for reads, right now */
5527 105965 : if (!(rw & REQ_WRITE)) {
5528 0 : bio_get(bio);
5529 0 : btrfsic_submit_bio(rw, bio);
5530 0 : bio_put(bio);
5531 : return;
5532 : }
5533 :
5534 : /*
5535 : * nr_async_bios allows us to reliably return congestion to the
5536 : * higher layers. Otherwise, the async bio makes it appear we have
5537 : * made progress against dirty pages when we've really just put it
5538 : * on a queue for later.
5539 : */
5540 105965 : atomic_inc(&root->fs_info->nr_async_bios);
5541 105966 : WARN_ON(bio->bi_next);
5542 105966 : bio->bi_next = NULL;
5543 105966 : bio->bi_rw |= rw;
5544 :
5545 : spin_lock(&device->io_lock);
5546 105966 : if (bio->bi_rw & REQ_SYNC)
5547 628 : pending_bios = &device->pending_sync_bios;
5548 : else
5549 105338 : pending_bios = &device->pending_bios;
5550 :
5551 105966 : if (pending_bios->tail)
5552 97958 : pending_bios->tail->bi_next = bio;
5553 :
5554 105966 : pending_bios->tail = bio;
5555 105966 : if (!pending_bios->head)
5556 8008 : pending_bios->head = bio;
5557 105966 : if (device->running_pending)
5558 : should_queue = 0;
5559 :
5560 : spin_unlock(&device->io_lock);
5561 :
5562 105966 : if (should_queue)
5563 36775 : btrfs_queue_work(root->fs_info->submit_workers,
5564 : &device->work);
5565 : }
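
/*
 * Hedged sketch, not kernel code: the queueing discipline of
 * btrfs_schedule_bio() above - append to a per-device tail-linked list
 * under a lock and only kick the worker when it is not already running.
 * pthread primitives and struct dev_queue are illustrative stand-ins;
 * the caller is assumed to have initialized the mutex.
 */
#include <pthread.h>
#include <stddef.h>

struct node { struct node *next; };

struct dev_queue {
	pthread_mutex_t lock;
	struct node *head, *tail;
	int running;		/* mirrors device->running_pending */
};

/* Returns nonzero when the caller should schedule the worker. */
static int queue_node(struct dev_queue *q, struct node *n)
{
	int should_queue;

	n->next = NULL;
	pthread_mutex_lock(&q->lock);
	if (q->tail)
		q->tail->next = n;
	q->tail = n;
	if (!q->head)
		q->head = n;
	should_queue = !q->running;	/* worker already draining the list? */
	pthread_mutex_unlock(&q->lock);
	return should_queue;
}
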
5566 :
5567 212914 : static int bio_size_ok(struct block_device *bdev, struct bio *bio,
5568 : sector_t sector)
5569 : {
5570 : struct bio_vec *prev;
5571 212914 : struct request_queue *q = bdev_get_queue(bdev);
5572 : unsigned int max_sectors = queue_max_sectors(q);
5573 425828 : struct bvec_merge_data bvm = {
5574 : .bi_bdev = bdev,
5575 : .bi_sector = sector,
5576 212914 : .bi_rw = bio->bi_rw,
5577 : };
5578 :
5579 212914 : if (WARN_ON(bio->bi_vcnt == 0))
5580 : return 1;
5581 :
5582 212912 : prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
5583 212912 : if (bio_sectors(bio) > max_sectors)
5584 : return 0;
5585 :
5586 212912 : if (!q->merge_bvec_fn)
5587 : return 1;
5588 :
5589 0 : bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
5590 0 : if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
5591 : return 0;
5592 0 : return 1;
5593 : }
5594 :
5595 212912 : static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5596 : struct bio *bio, u64 physical, int dev_nr,
5597 : int rw, int async)
5598 : {
5599 212912 : struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
5600 :
5601 212912 : bio->bi_private = bbio;
5602 212912 : btrfs_io_bio(bio)->stripe_index = dev_nr;
5603 212912 : bio->bi_end_io = btrfs_end_bio;
5604 212912 : bio->bi_iter.bi_sector = physical >> 9;
5605 : #ifdef DEBUG
5606 : {
5607 : struct rcu_string *name;
5608 :
5609 : rcu_read_lock();
5610 : name = rcu_dereference(dev->name);
5611 : pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
5612 : "(%s id %llu), size=%u\n", rw,
5613 : (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
5614 : name->str, dev->devid, bio->bi_size);
5615 : rcu_read_unlock();
5616 : }
5617 : #endif
5618 212912 : bio->bi_bdev = dev->bdev;
5619 :
5620 212912 : btrfs_bio_counter_inc_noblocked(root->fs_info);
5621 :
5622 212910 : if (async)
5623 105964 : btrfs_schedule_bio(root, dev, rw, bio);
5624 : else
5625 106946 : btrfsic_submit_bio(rw, bio);
5626 212915 : }
5627 :
5628 0 : static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5629 : struct bio *first_bio, struct btrfs_device *dev,
5630 : int dev_nr, int rw, int async)
5631 : {
5632 0 : struct bio_vec *bvec = first_bio->bi_io_vec;
5633 : struct bio *bio;
5634 0 : int nr_vecs = bio_get_nr_vecs(dev->bdev);
5635 0 : u64 physical = bbio->stripes[dev_nr].physical;
5636 :
5637 : again:
5638 0 : bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
5639 0 : if (!bio)
5640 : return -ENOMEM;
5641 :
5642 0 : while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
5643 0 : if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5644 0 : bvec->bv_offset) < bvec->bv_len) {
5645 0 : u64 len = bio->bi_iter.bi_size;
5646 :
5647 0 : atomic_inc(&bbio->stripes_pending);
5648 0 : submit_stripe_bio(root, bbio, bio, physical, dev_nr,
5649 : rw, async);
5650 0 : physical += len;
5651 : goto again;
5652 : }
5653 0 : bvec++;
5654 : }
5655 :
5656 0 : submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
5657 : return 0;
5658 : }
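
/*
 * Hedged sketch, not kernel code: the splitting loop of
 * breakup_stripe_bio() above - keep adding pages until the device
 * refuses one, submit the partial request, advance the physical offset
 * by what was queued, and start over. The page-count limit and helper
 * functions are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ   4096
#define MAX_PAGES 4	/* illustrative per-request device limit */

static void submit(uint64_t physical, int pages)
{
	printf("submit %d pages at physical %llu\n",
	       pages, (unsigned long long)physical);
}

static void split_and_submit(uint64_t physical, int total_pages)
{
	int queued = 0;

	for (int i = 0; i < total_pages; i++) {
		if (queued == MAX_PAGES) {	/* "bio_add_page failed" */
			submit(physical, queued);
			physical += (uint64_t)queued * PAGE_SZ;
			queued = 0;
		}
		queued++;	/* page accepted into the current request */
	}
	if (queued)
		submit(physical, queued);	/* tail request */
}

int main(void)
{
	split_and_submit(0, 10);	/* submits 4 + 4 + 2 pages */
	return 0;
}
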
5659 :
5660 0 : static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
5661 : {
5662 0 : atomic_inc(&bbio->error);
5663 0 : if (atomic_dec_and_test(&bbio->stripes_pending)) {
5664 : /* Should be the original bio. */
5665 0 : WARN_ON(bio != bbio->orig_bio);
5666 :
5667 0 : bio->bi_private = bbio->private;
5668 0 : bio->bi_end_io = bbio->end_io;
5669 0 : btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5670 0 : bio->bi_iter.bi_sector = logical >> 9;
5671 :
5672 0 : btrfs_end_bbio(bbio, bio, -EIO);
5673 : }
5674 0 : }
5675 :
5676 147591 : int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5677 : int mirror_num, int async_submit)
5678 : {
5679 : struct btrfs_device *dev;
5680 : struct bio *first_bio = bio;
5681 147591 : u64 logical = (u64)bio->bi_iter.bi_sector << 9;
5682 : u64 length = 0;
5683 : u64 map_length;
5684 147591 : u64 *raid_map = NULL;
5685 : int ret;
5686 : int dev_nr = 0;
5687 : int total_devs = 1;
5688 147591 : struct btrfs_bio *bbio = NULL;
5689 :
5690 147591 : length = bio->bi_iter.bi_size;
5691 147591 : map_length = length;
5692 :
5693 147591 : btrfs_bio_counter_inc_blocked(root->fs_info);
5694 147590 : ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5695 : mirror_num, &raid_map);
5696 147595 : if (ret) {
5697 0 : btrfs_bio_counter_dec(root->fs_info);
5698 0 : return ret;
5699 : }
5700 :
5701 147595 : total_devs = bbio->num_stripes;
5702 147595 : bbio->orig_bio = first_bio;
5703 147595 : bbio->private = first_bio->bi_private;
5704 147595 : bbio->end_io = first_bio->bi_end_io;
5705 147595 : bbio->fs_info = root->fs_info;
5706 : atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5707 :
5708 147595 : if (raid_map) {
5709 : /* In this case, map_length has been set to the length of
5710 : a single stripe, not the whole write */
5711 38 : if (rw & WRITE) {
5712 38 : ret = raid56_parity_write(root, bio, bbio,
5713 : raid_map, map_length);
5714 : } else {
5715 0 : ret = raid56_parity_recover(root, bio, bbio,
5716 : raid_map, map_length,
5717 : mirror_num);
5718 : }
5719 : /*
5720 : * FIXME: replace doesn't support raid56 yet, please fix
5721 : * it in the future.
5722 : */
5723 38 : btrfs_bio_counter_dec(root->fs_info);
5724 38 : return ret;
5725 : }
5726 :
5727 147557 : if (map_length < length) {
5728 0 : btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
5729 : logical, length, map_length);
5730 0 : BUG();
5731 : }
5732 :
5733 360471 : while (dev_nr < total_devs) {
5734 212912 : dev = bbio->stripes[dev_nr].dev;
5735 212912 : if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
5736 0 : bbio_error(bbio, first_bio, logical);
5737 0 : dev_nr++;
5738 0 : continue;
5739 : }
5740 :
5741 : /*
5742 : * Check and see if we're OK with this bio based on its size
5743 : * and offset with the given device.
5744 : */
5745 212916 : if (!bio_size_ok(dev->bdev, first_bio,
5746 212916 : bbio->stripes[dev_nr].physical >> 9)) {
5747 0 : ret = breakup_stripe_bio(root, bbio, first_bio, dev,
5748 : dev_nr, rw, async_submit);
5749 0 : BUG_ON(ret);
5750 0 : dev_nr++;
5751 0 : continue;
5752 : }
5753 :
5754 212913 : if (dev_nr < total_devs - 1) {
5755 65356 : bio = btrfs_bio_clone(first_bio, GFP_NOFS);
5756 65356 : BUG_ON(!bio); /* -ENOMEM */
5757 : } else {
5758 : bio = first_bio;
5759 147557 : bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
5760 : }
5761 :
5762 212913 : submit_stripe_bio(root, bbio, bio,
5763 212913 : bbio->stripes[dev_nr].physical, dev_nr, rw,
5764 : async_submit);
5765 212914 : dev_nr++;
5766 : }
5767 147559 : btrfs_bio_counter_dec(root->fs_info);
5768 147559 : return 0;
5769 : }
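
/*
 * Hedged sketch, not kernel code: the fan-out loop of btrfs_map_bio()
 * above. Every stripe but the last gets a clone of the incoming request;
 * the last stripe reuses the original, whose completion is gated by the
 * shared pending counter. struct req and the helpers are illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

struct req { int is_clone; };

static struct req *clone_req(void)
{
	struct req *r = malloc(sizeof(*r));

	if (r)
		r->is_clone = 1;
	return r;
}

static void submit_req(struct req *r, int stripe)
{
	printf("stripe %d: %s\n", stripe, r->is_clone ? "clone" : "original");
	if (r->is_clone)
		free(r);
}

static void fan_out(struct req *orig, int total_devs)
{
	for (int i = 0; i < total_devs; i++) {
		struct req *r = (i < total_devs - 1) ? clone_req() : orig;

		if (!r)
			continue;	/* the kernel BUGs on -ENOMEM here */
		submit_req(r, i);
	}
}

int main(void)
{
	struct req orig = { .is_clone = 0 };

	fan_out(&orig, 3);
	return 0;
}
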
5770 :
5771 1973 : struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
5772 : u8 *uuid, u8 *fsid)
5773 : {
5774 : struct btrfs_device *device;
5775 : struct btrfs_fs_devices *cur_devices;
5776 :
5777 1973 : cur_devices = fs_info->fs_devices;
5778 3970 : while (cur_devices) {
5779 2229 : if (!fsid ||
5780 256 : !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
5781 1973 : device = __find_device(&cur_devices->devices,
5782 : devid, uuid);
5783 1973 : if (device)
5784 : return device;
5785 : }
5786 24 : cur_devices = cur_devices->seed;
5787 : }
5788 : return NULL;
5789 : }
5790 :
5791 0 : static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5792 : u64 devid, u8 *dev_uuid)
5793 : {
5794 : struct btrfs_device *device;
5795 0 : struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5796 :
5797 0 : device = btrfs_alloc_device(NULL, &devid, dev_uuid);
5798 0 : if (IS_ERR(device))
5799 : return NULL;
5800 :
5801 0 : list_add(&device->dev_list, &fs_devices->devices);
5802 0 : device->fs_devices = fs_devices;
5803 0 : fs_devices->num_devices++;
5804 :
5805 0 : device->missing = 1;
5806 0 : fs_devices->missing_devices++;
5807 :
5808 : return device;
5809 : }
5810 :
5811 : /**
5812 : * btrfs_alloc_device - allocate struct btrfs_device
5813 : * @fs_info: used only for generating a new devid, can be NULL if
5814 : * devid is provided (i.e. @devid != NULL).
5815 : * @devid: a pointer to devid for this device. If NULL a new devid
5816 : * is generated.
5817 : * @uuid: a pointer to UUID for this device. If NULL a new UUID
5818 : * is generated.
5819 : *
5820 : * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
5821 : * on error. Returned struct is not linked onto any lists and can be
5822 : * destroyed with kfree() right away.
5823 : */
5824 382 : struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
5825 : const u64 *devid,
5826 : const u8 *uuid)
5827 : {
5828 : struct btrfs_device *dev;
5829 : u64 tmp;
5830 :
5831 382 : if (WARN_ON(!devid && !fs_info))
5832 : return ERR_PTR(-EINVAL);
5833 :
5834 382 : dev = __alloc_device();
5835 382 : if (IS_ERR(dev))
5836 : return dev;
5837 :
5838 382 : if (devid)
5839 382 : tmp = *devid;
5840 : else {
5841 : int ret;
5842 :
5843 0 : ret = find_next_devid(fs_info, &tmp);
5844 0 : if (ret) {
5845 0 : kfree(dev);
5846 0 : return ERR_PTR(ret);
5847 : }
5848 : }
5849 382 : dev->devid = tmp;
5850 :
5851 382 : if (uuid)
5852 374 : memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
5853 : else
5854 8 : generate_random_uuid(dev->uuid);
5855 :
5856 382 : btrfs_init_work(&dev->work, btrfs_submit_helper,
5857 : pending_bios_fn, NULL, NULL);
5858 :
5859 382 : return dev;
5860 : }
5861 :
5862 1572 : static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
5863 : struct extent_buffer *leaf,
5864 : struct btrfs_chunk *chunk)
5865 : {
5866 1572 : struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5867 : struct map_lookup *map;
5868 : struct extent_map *em;
5869 : u64 logical;
5870 : u64 length;
5871 : u64 devid;
5872 : u8 uuid[BTRFS_UUID_SIZE];
5873 : int num_stripes;
5874 : int ret;
5875 : int i;
5876 :
5877 1572 : logical = key->offset;
5878 : length = btrfs_chunk_length(leaf, chunk);
5879 :
5880 1572 : read_lock(&map_tree->map_tree.lock);
5881 1572 : em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
5882 : read_unlock(&map_tree->map_tree.lock);
5883 :
5884 : /* already mapped? */
5885 1572 : if (em && em->start <= logical && em->start + em->len > logical) {
5886 431 : free_extent_map(em);
5887 431 : return 0;
5888 1141 : } else if (em) {
5889 0 : free_extent_map(em);
5890 : }
5891 :
5892 1141 : em = alloc_extent_map();
5893 1141 : if (!em)
5894 : return -ENOMEM;
5895 1141 : num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
5896 1141 : map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
5897 1141 : if (!map) {
5898 0 : free_extent_map(em);
5899 0 : return -ENOMEM;
5900 : }
5901 :
5902 : set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5903 1141 : em->bdev = (struct block_device *)map;
5904 1141 : em->start = logical;
5905 1141 : em->len = length;
5906 1141 : em->orig_start = 0;
5907 1141 : em->block_start = 0;
5908 1141 : em->block_len = em->len;
5909 :
5910 1141 : map->num_stripes = num_stripes;
5911 1141 : map->io_width = btrfs_chunk_io_width(leaf, chunk);
5912 1141 : map->io_align = btrfs_chunk_io_align(leaf, chunk);
5913 1141 : map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
5914 1141 : map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
5915 1141 : map->type = btrfs_chunk_type(leaf, chunk);
5916 1141 : map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
5917 2757 : for (i = 0; i < num_stripes; i++) {
5918 1616 : map->stripes[i].physical =
5919 : btrfs_stripe_offset_nr(leaf, chunk, i);
5920 : devid = btrfs_stripe_devid_nr(leaf, chunk, i);
5921 1616 : read_extent_buffer(leaf, uuid, (unsigned long)
5922 1616 : btrfs_stripe_dev_uuid_nr(chunk, i),
5923 : BTRFS_UUID_SIZE);
5924 1616 : map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
5925 : uuid, NULL);
5926 1616 : if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
5927 0 : free_extent_map(em);
5928 0 : return -EIO;
5929 : }
5930 1616 : if (!map->stripes[i].dev) {
5931 0 : map->stripes[i].dev =
5932 0 : add_missing_dev(root, devid, uuid);
5933 0 : if (!map->stripes[i].dev) {
5934 0 : free_extent_map(em);
5935 0 : return -EIO;
5936 : }
5937 : }
5938 1616 : map->stripes[i].dev->in_fs_metadata = 1;
5939 : }
5940 :
5941 1141 : write_lock(&map_tree->map_tree.lock);
5942 1141 : ret = add_extent_mapping(&map_tree->map_tree, em, 0);
5943 : write_unlock(&map_tree->map_tree.lock);
5944 1141 : BUG_ON(ret); /* Tree corruption */
5945 1141 : free_extent_map(em);
5946 :
5947 1141 : return 0;
5948 : }
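
/*
 * Hedged sketch, not kernel code: the single-allocation flexible-array
 * pattern behind map_lookup_size(num_stripes) used in read_one_chunk()
 * above - one header plus num_stripes trailing stripe records. The
 * struct layout here is illustrative, not the on-disk format.
 */
#include <stdlib.h>
#include <stdint.h>

struct stripe_rec {
	uint64_t physical;
	uint64_t devid;
};

struct lookup {
	int num_stripes;
	uint64_t stripe_len;
	struct stripe_rec stripes[];	/* flexible array member */
};

static struct lookup *alloc_lookup(int num_stripes)
{
	struct lookup *map;

	/* mirrors map_lookup_size(): header + per-stripe entries */
	map = malloc(sizeof(*map) +
		     (size_t)num_stripes * sizeof(map->stripes[0]));
	if (map)
		map->num_stripes = num_stripes;
	return map;
}
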
5949 :
5950 248 : static void fill_device_from_item(struct extent_buffer *leaf,
5951 : struct btrfs_dev_item *dev_item,
5952 : struct btrfs_device *device)
5953 : {
5954 : unsigned long ptr;
5955 :
5956 248 : device->devid = btrfs_device_id(leaf, dev_item);
5957 248 : device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
5958 248 : device->total_bytes = device->disk_total_bytes;
5959 248 : device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
5960 248 : device->type = btrfs_device_type(leaf, dev_item);
5961 248 : device->io_align = btrfs_device_io_align(leaf, dev_item);
5962 248 : device->io_width = btrfs_device_io_width(leaf, dev_item);
5963 248 : device->sector_size = btrfs_device_sector_size(leaf, dev_item);
5964 248 : WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
5965 248 : device->is_tgtdev_for_dev_replace = 0;
5966 :
5967 : ptr = btrfs_device_uuid(dev_item);
5968 248 : read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
5969 248 : }
5970 :
5971 0 : static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
5972 : {
5973 : struct btrfs_fs_devices *fs_devices;
5974 : int ret;
5975 :
5976 0 : BUG_ON(!mutex_is_locked(&uuid_mutex));
5977 :
5978 0 : fs_devices = root->fs_info->fs_devices->seed;
5979 0 : while (fs_devices) {
5980 0 : if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
5981 : ret = 0;
5982 : goto out;
5983 : }
5984 0 : fs_devices = fs_devices->seed;
5985 : }
5986 :
5987 0 : fs_devices = find_fsid(fsid);
5988 0 : if (!fs_devices) {
5989 : ret = -ENOENT;
5990 : goto out;
5991 : }
5992 :
5993 0 : fs_devices = clone_fs_devices(fs_devices);
5994 0 : if (IS_ERR(fs_devices)) {
5995 0 : ret = PTR_ERR(fs_devices);
5996 : goto out;
5997 : }
5998 :
5999 0 : ret = __btrfs_open_devices(fs_devices, FMODE_READ,
6000 0 : root->fs_info->bdev_holder);
6001 0 : if (ret) {
6002 0 : free_fs_devices(fs_devices);
6003 : goto out;
6004 : }
6005 :
6006 0 : if (!fs_devices->seeding) {
6007 0 : __btrfs_close_devices(fs_devices);
6008 0 : free_fs_devices(fs_devices);
6009 : ret = -EINVAL;
6010 : goto out;
6011 : }
6012 :
6013 0 : fs_devices->seed = root->fs_info->fs_devices->seed;
6014 0 : root->fs_info->fs_devices->seed = fs_devices;
6015 : out:
6016 0 : return ret;
6017 : }
6018 :
6019 248 : static int read_one_dev(struct btrfs_root *root,
6020 : struct extent_buffer *leaf,
6021 : struct btrfs_dev_item *dev_item)
6022 : {
6023 : struct btrfs_device *device;
6024 : u64 devid;
6025 : int ret;
6026 : u8 fs_uuid[BTRFS_UUID_SIZE];
6027 : u8 dev_uuid[BTRFS_UUID_SIZE];
6028 :
6029 : devid = btrfs_device_id(leaf, dev_item);
6030 248 : read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6031 : BTRFS_UUID_SIZE);
6032 248 : read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6033 : BTRFS_UUID_SIZE);
6034 :
6035 248 : if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
6036 0 : ret = open_seed_devices(root, fs_uuid);
6037 0 : if (ret && !btrfs_test_opt(root, DEGRADED))
6038 : return ret;
6039 : }
6040 :
6041 248 : device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
6042 248 : if (!device || !device->bdev) {
6043 0 : if (!btrfs_test_opt(root, DEGRADED))
6044 : return -EIO;
6045 :
6046 0 : if (!device) {
6047 0 : btrfs_warn(root->fs_info, "devid %llu missing", devid);
6048 0 : device = add_missing_dev(root, devid, dev_uuid);
6049 0 : if (!device)
6050 : return -ENOMEM;
6051 0 : } else if (!device->missing) {
6052 : /*
6053 : * This happens when a device that was properly set up
6054 : * in the device info lists suddenly goes bad.
6055 : * device->bdev is NULL, so we have to set
6056 : * device->missing to one here.
6057 : */
6058 0 : root->fs_info->fs_devices->missing_devices++;
6059 0 : device->missing = 1;
6060 : }
6061 : }
6062 :
6063 248 : if (device->fs_devices != root->fs_info->fs_devices) {
6064 0 : BUG_ON(device->writeable);
6065 0 : if (device->generation !=
6066 : btrfs_device_generation(leaf, dev_item))
6067 : return -EINVAL;
6068 : }
6069 :
6070 248 : fill_device_from_item(leaf, dev_item, device);
6071 248 : device->in_fs_metadata = 1;
6072 248 : if (device->writeable && !device->is_tgtdev_for_dev_replace) {
6073 248 : device->fs_devices->total_rw_bytes += device->total_bytes;
6074 248 : spin_lock(&root->fs_info->free_chunk_lock);
6075 496 : root->fs_info->free_chunk_space += device->total_bytes -
6076 248 : device->bytes_used;
6077 248 : spin_unlock(&root->fs_info->free_chunk_lock);
6078 : }
6079 : ret = 0;
6080 : return ret;
6081 : }
6082 :
6083 221 : int btrfs_read_sys_array(struct btrfs_root *root)
6084 : {
6085 221 : struct btrfs_super_block *super_copy = root->fs_info->super_copy;
6086 : struct extent_buffer *sb;
6087 : struct btrfs_disk_key *disk_key;
6088 : struct btrfs_chunk *chunk;
6089 : u8 *ptr;
6090 : unsigned long sb_ptr;
6091 : int ret = 0;
6092 : u32 num_stripes;
6093 : u32 array_size;
6094 : u32 len = 0;
6095 : u32 cur;
6096 : struct btrfs_key key;
6097 :
6098 221 : sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
6099 : BTRFS_SUPER_INFO_SIZE);
6100 221 : if (!sb)
6101 : return -ENOMEM;
6102 221 : btrfs_set_buffer_uptodate(sb);
6103 : btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6104 : /*
6105 : * The sb extent buffer is artificial and just used to read the system array.
6106 : * The btrfs_set_buffer_uptodate() call does not properly mark all of its
6107 : * pages up-to-date when the page is larger: the extent does not cover the
6108 : * whole page and consequently check_page_uptodate does not find all
6109 : * the page's extents up-to-date (the hole beyond sb), so
6110 : * write_extent_buffer then triggers a WARN_ON.
6111 : *
6112 : * Regular short extents go through the mark_extent_buffer_dirty/writeback
6113 : * cycle, but sb spans only this function. Add an explicit SetPageUptodate
6114 : * call to silence the warning, e.g. on PowerPC 64.
6115 : */
6116 : if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
6117 : SetPageUptodate(sb->pages[0]);
6118 :
6119 221 : write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6120 : array_size = btrfs_super_sys_array_size(super_copy);
6121 :
6122 221 : ptr = super_copy->sys_chunk_array;
6123 : sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
6124 : cur = 0;
6125 :
6126 873 : while (cur < array_size) {
6127 : disk_key = (struct btrfs_disk_key *)ptr;
6128 : btrfs_disk_key_to_cpu(&key, disk_key);
6129 :
6130 431 : len = sizeof(*disk_key); ptr += len;
6131 431 : sb_ptr += len;
6132 431 : cur += len;
6133 :
6134 431 : if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6135 431 : chunk = (struct btrfs_chunk *)sb_ptr;
6136 431 : ret = read_one_chunk(root, &key, sb, chunk);
6137 431 : if (ret)
6138 : break;
6139 : num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6140 862 : len = btrfs_chunk_item_size(num_stripes);
6141 : } else {
6142 : ret = -EIO;
6143 : break;
6144 : }
6145 431 : ptr += len;
6146 431 : sb_ptr += len;
6147 431 : cur += len;
6148 : }
6149 221 : free_extent_buffer(sb);
6150 221 : return ret;
6151 : }
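
/*
 * Hedged userspace sketch, not kernel code: the shape of the
 * sys_chunk_array walk in btrfs_read_sys_array() above - a packed
 * sequence of (disk key, chunk item) pairs where each chunk item's size
 * depends on its stripe count. CHUNK_ITEM_KEY is the real on-disk value
 * of BTRFS_CHUNK_ITEM_KEY; the record layouts and sizes below are
 * illustrative, not the real on-disk structures.
 */
#include <stdint.h>
#include <string.h>

#define CHUNK_ITEM_KEY 228	/* real value of BTRFS_CHUNK_ITEM_KEY */
#define STRIPE_REC_SZ  32	/* illustrative per-stripe record size */

struct demo_key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
} __attribute__((packed));

struct demo_chunk_hdr {
	uint64_t length;
	uint32_t num_stripes;
} __attribute__((packed));

static int walk_sys_array(const uint8_t *buf, uint32_t array_size)
{
	uint32_t cur = 0;

	while (cur < array_size) {
		struct demo_key key;
		struct demo_chunk_hdr hdr;

		if (cur + sizeof(key) > array_size)
			return -1;
		memcpy(&key, buf + cur, sizeof(key));
		cur += sizeof(key);

		if (key.type != CHUNK_ITEM_KEY)
			return -1;	/* only chunk items may appear here */

		if (cur + sizeof(hdr) > array_size)
			return -1;
		memcpy(&hdr, buf + cur, sizeof(hdr));
		/* variable-size record: header plus its stripe entries */
		cur += sizeof(hdr) + hdr.num_stripes * STRIPE_REC_SZ;
	}
	return 0;
}
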
6152 :
6153 663 : int btrfs_read_chunk_tree(struct btrfs_root *root)
6154 : {
6155 : struct btrfs_path *path;
6156 1610 : struct extent_buffer *leaf;
6157 : struct btrfs_key key;
6158 : struct btrfs_key found_key;
6159 : int ret;
6160 : int slot;
6161 :
6162 221 : root = root->fs_info->chunk_root;
6163 :
6164 221 : path = btrfs_alloc_path();
6165 221 : if (!path)
6166 : return -ENOMEM;
6167 :
6168 221 : mutex_lock(&uuid_mutex);
6169 : lock_chunks(root);
6170 :
6171 : /*
6172 : * Read all device items, and then all the chunk items. All
6173 : * device items are found before any chunk item (their object id
6174 : * is smaller than the lowest possible object id for a chunk
6175 : * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
6176 : */
6177 221 : key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
6178 221 : key.offset = 0;
6179 221 : key.type = 0;
6180 221 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6181 221 : if (ret < 0)
6182 : goto error;
6183 : while (1) {
6184 1610 : leaf = path->nodes[0];
6185 1610 : slot = path->slots[0];
6186 3220 : if (slot >= btrfs_header_nritems(leaf)) {
6187 221 : ret = btrfs_next_leaf(root, path);
6188 221 : if (ret == 0)
6189 0 : continue;
6190 221 : if (ret < 0)
6191 : goto error;
6192 : break;
6193 : }
6194 1389 : btrfs_item_key_to_cpu(leaf, &found_key, slot);
6195 1389 : if (found_key.type == BTRFS_DEV_ITEM_KEY) {
6196 : struct btrfs_dev_item *dev_item;
6197 248 : dev_item = btrfs_item_ptr(leaf, slot,
6198 : struct btrfs_dev_item);
6199 248 : ret = read_one_dev(root, leaf, dev_item);
6200 248 : if (ret)
6201 : goto error;
6202 1141 : } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
6203 : struct btrfs_chunk *chunk;
6204 1141 : chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
6205 1141 : ret = read_one_chunk(root, &found_key, leaf, chunk);
6206 1141 : if (ret)
6207 : goto error;
6208 : }
6209 1389 : path->slots[0]++;
6210 : }
6211 : ret = 0;
6212 : error:
6213 : unlock_chunks(root);
6214 221 : mutex_unlock(&uuid_mutex);
6215 :
6216 221 : btrfs_free_path(path);
6217 221 : return ret;
6218 : }
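
/*
 * Hedged sketch, not kernel code: why the single search starting at
 * (BTRFS_DEV_ITEMS_OBJECTID, 0, 0) in btrfs_read_chunk_tree() above sees
 * every device item before any chunk item. Keys compare by
 * (objectid, type, offset), and the device-items objectid (1) sorts
 * below BTRFS_FIRST_CHUNK_TREE_OBJECTID (256); those two constants are
 * the real on-disk values, while the comparator itself is illustrative.
 */
#include <stdint.h>

struct demo_key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

static int demo_key_cmp(const struct demo_key *a, const struct demo_key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}
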
6219 :
6220 221 : void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
6221 : {
6222 221 : struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6223 : struct btrfs_device *device;
6224 :
6225 663 : while (fs_devices) {
6226 221 : mutex_lock(&fs_devices->device_list_mutex);
6227 469 : list_for_each_entry(device, &fs_devices->devices, dev_list)
6228 248 : device->dev_root = fs_info->dev_root;
6229 221 : mutex_unlock(&fs_devices->device_list_mutex);
6230 :
6231 221 : fs_devices = fs_devices->seed;
6232 : }
6233 221 : }
6234 :
6235 : static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
6236 : {
6237 : int i;
6238 :
6239 595 : for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6240 : btrfs_dev_stat_reset(dev, i);
6241 : }
6242 :
6243 221 : int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
6244 : {
6245 : struct btrfs_key key;
6246 : struct btrfs_key found_key;
6247 221 : struct btrfs_root *dev_root = fs_info->dev_root;
6248 221 : struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6249 : struct extent_buffer *eb;
6250 : int slot;
6251 : int ret = 0;
6252 : struct btrfs_device *device;
6253 : struct btrfs_path *path = NULL;
6254 : int i;
6255 :
6256 221 : path = btrfs_alloc_path();
6257 221 : if (!path) {
6258 : ret = -ENOMEM;
6259 : goto out;
6260 : }
6261 :
6262 221 : mutex_lock(&fs_devices->device_list_mutex);
6263 469 : list_for_each_entry(device, &fs_devices->devices, dev_list) {
6264 : int item_size;
6265 : struct btrfs_dev_stats_item *ptr;
6266 :
6267 248 : key.objectid = 0;
6268 248 : key.type = BTRFS_DEV_STATS_KEY;
6269 248 : key.offset = device->devid;
6270 248 : ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
6271 248 : if (ret) {
6272 : __btrfs_reset_dev_stats(device);
6273 119 : device->dev_stats_valid = 1;
6274 119 : btrfs_release_path(path);
6275 119 : continue;
6276 : }
6277 129 : slot = path->slots[0];
6278 129 : eb = path->nodes[0];
6279 129 : btrfs_item_key_to_cpu(eb, &found_key, slot);
6280 129 : item_size = btrfs_item_size_nr(eb, slot);
6281 :
6282 129 : ptr = btrfs_item_ptr(eb, slot,
6283 : struct btrfs_dev_stats_item);
6284 :
6285 903 : for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
6286 645 : if (item_size >= (1 + i) * sizeof(__le64))
6287 : btrfs_dev_stat_set(device, i,
6288 : btrfs_dev_stats_value(eb, ptr, i));
6289 : else
6290 : btrfs_dev_stat_reset(device, i);
6291 : }
6292 :
6293 129 : device->dev_stats_valid = 1;
6294 129 : btrfs_dev_stat_print_on_load(device);
6295 129 : btrfs_release_path(path);
6296 : }
6297 221 : mutex_unlock(&fs_devices->device_list_mutex);
6298 :
6299 : out:
6300 221 : btrfs_free_path(path);
6301 221 : return ret < 0 ? ret : 0;
6302 : }
6303 :
6304 221 : static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6305 : struct btrfs_root *dev_root,
6306 : struct btrfs_device *device)
6307 : {
6308 : struct btrfs_path *path;
6309 : struct btrfs_key key;
6310 : struct extent_buffer *eb;
6311 : struct btrfs_dev_stats_item *ptr;
6312 : int ret;
6313 : int i;
6314 :
6315 221 : key.objectid = 0;
6316 221 : key.type = BTRFS_DEV_STATS_KEY;
6317 221 : key.offset = device->devid;
6318 :
6319 221 : path = btrfs_alloc_path();
6320 221 : BUG_ON(!path);
6321 221 : ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
6322 221 : if (ret < 0) {
6323 0 : printk_in_rcu(KERN_WARNING "BTRFS: "
6324 : "error %d while searching for dev_stats item for device %s!\n",
6325 : ret, rcu_str_deref(device->name));
6326 : goto out;
6327 : }
6328 :
6329 323 : if (ret == 0 &&
6330 102 : btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
6331 : /* need to delete old one and insert a new one */
6332 : ret = btrfs_del_item(trans, dev_root, path);
6333 0 : if (ret != 0) {
6334 0 : printk_in_rcu(KERN_WARNING "BTRFS: "
6335 : "delete too small dev_stats item for device %s failed %d!\n",
6336 : rcu_str_deref(device->name), ret);
6337 : goto out;
6338 : }
6339 : ret = 1;
6340 : }
6341 :
6342 221 : if (ret == 1) {
6343 : /* need to insert a new item */
6344 119 : btrfs_release_path(path);
6345 : ret = btrfs_insert_empty_item(trans, dev_root, path,
6346 : &key, sizeof(*ptr));
6347 119 : if (ret < 0) {
6348 0 : printk_in_rcu(KERN_WARNING "BTRFS: "
6349 : "insert dev_stats item for device %s failed %d!\n",
6350 : rcu_str_deref(device->name), ret);
6351 : goto out;
6352 : }
6353 : }
6354 :
6355 221 : eb = path->nodes[0];
6356 442 : ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
6357 1326 : for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6358 1105 : btrfs_set_dev_stats_value(eb, ptr, i,
6359 : btrfs_dev_stat_read(device, i));
6360 221 : btrfs_mark_buffer_dirty(eb);
6361 :
6362 : out:
6363 221 : btrfs_free_path(path);
6364 221 : return ret;
6365 : }
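
/*
 * Hedged sketch, not kernel code: the size-upgrade pattern of
 * update_dev_stat_item() above - if the stored record is smaller than
 * the current structure (written by an older format), delete it and
 * insert a full-size one before writing all the fields. struct record
 * and this malloc-backed store are entirely illustrative.
 */
#include <stdlib.h>
#include <string.h>

struct record {
	size_t size;
	void *data;
};

static int update_record(struct record *rec, const void *data, size_t size)
{
	if (rec->data && rec->size < size) {
		free(rec->data);	/* "delete too small item" */
		rec->data = NULL;
	}
	if (!rec->data) {		/* "insert a new, full-size item" */
		rec->data = malloc(size);
		if (!rec->data)
			return -1;
		rec->size = size;
	}
	memcpy(rec->data, data, size);	/* write every field */
	return 0;
}
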
6366 :
6367 : /*
6368 : * Called from commit_transaction(). Writes all changed device stats to disk.
6369 : */
6370 2098 : int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
6371 : struct btrfs_fs_info *fs_info)
6372 : {
6373 2098 : struct btrfs_root *dev_root = fs_info->dev_root;
6374 2098 : struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6375 : struct btrfs_device *device;
6376 : int ret = 0;
6377 :
6378 2098 : mutex_lock(&fs_devices->device_list_mutex);
6379 4277 : list_for_each_entry(device, &fs_devices->devices, dev_list) {
6380 2179 : if (!device->dev_stats_valid || !device->dev_stats_dirty)
6381 1958 : continue;
6382 :
6383 221 : ret = update_dev_stat_item(trans, dev_root, device);
6384 221 : if (!ret)
6385 221 : device->dev_stats_dirty = 0;
6386 : }
6387 2098 : mutex_unlock(&fs_devices->device_list_mutex);
6388 :
6389 2098 : return ret;
6390 : }
6391 :
6392 0 : void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
6393 : {
6394 : btrfs_dev_stat_inc(dev, index);
6395 0 : btrfs_dev_stat_print_on_error(dev);
6396 0 : }
6397 :
6398 0 : static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
6399 : {
6400 0 : if (!dev->dev_stats_valid)
6401 0 : return;
6402 0 : printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
6403 : "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
6404 : rcu_str_deref(dev->name),
6405 : btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6406 : btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
6407 : btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
6408 : btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
6409 : btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
6410 : }
6411 :
6412 129 : static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
6413 : {
6414 : int i;
6415 :
6416 774 : for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6417 645 : if (btrfs_dev_stat_read(dev, i) != 0)
6418 : break;
6419 129 : if (i == BTRFS_DEV_STAT_VALUES_MAX)
6420 129 : return; /* all values == 0, suppress message */
6421 :
6422 0 : printk_in_rcu(KERN_INFO "BTRFS: "
6423 : "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
6424 : rcu_str_deref(dev->name),
6425 : btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6426 : btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
6427 : btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
6428 : btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
6429 : btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
6430 : }
6431 :
6432 10 : int btrfs_get_dev_stats(struct btrfs_root *root,
6433 : struct btrfs_ioctl_get_dev_stats *stats)
6434 : {
6435 : struct btrfs_device *dev;
6436 10 : struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6437 : int i;
6438 :
6439 10 : mutex_lock(&fs_devices->device_list_mutex);
6440 10 : dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
6441 10 : mutex_unlock(&fs_devices->device_list_mutex);
6442 :
6443 10 : if (!dev) {
6444 3 : btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
6445 3 : return -ENODEV;
6446 7 : } else if (!dev->dev_stats_valid) {
6447 0 : btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
6448 0 : return -ENODEV;
6449 7 : } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
6450 0 : for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
6451 0 : if (stats->nr_items > i)
6452 0 : stats->values[i] =
6453 : btrfs_dev_stat_read_and_reset(dev, i);
6454 : else
6455 : btrfs_dev_stat_reset(dev, i);
6456 : }
6457 : } else {
6458 35 : for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6459 35 : if (stats->nr_items > i)
6460 35 : stats->values[i] = btrfs_dev_stat_read(dev, i);
6461 : }
6462 7 : if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
6463 0 : stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
6464 : return 0;
6465 : }
6466 :
6467 8 : int btrfs_scratch_superblock(struct btrfs_device *device)
6468 : {
6469 : struct buffer_head *bh;
6470 : struct btrfs_super_block *disk_super;
6471 :
6472 8 : bh = btrfs_read_dev_super(device->bdev);
6473 8 : if (!bh)
6474 : return -EINVAL;
6475 8 : disk_super = (struct btrfs_super_block *)bh->b_data;
6476 :
6477 8 : memset(&disk_super->magic, 0, sizeof(disk_super->magic));
6478 : set_buffer_dirty(bh);
6479 8 : sync_dirty_buffer(bh);
6480 : brelse(bh);
6481 :
6482 : return 0;
6483 : }