Line data Source code
1 : /*
2 : * Copyright (C) STRATO AG 2012. All rights reserved.
3 : *
4 : * This program is free software; you can redistribute it and/or
5 : * modify it under the terms of the GNU General Public
6 : * License v2 as published by the Free Software Foundation.
7 : *
8 : * This program is distributed in the hope that it will be useful,
9 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 : * General Public License for more details.
12 : *
13 : * You should have received a copy of the GNU General Public
14 : * License along with this program; if not, write to the
15 : * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 : * Boston, MA 02111-1307, USA.
17 : */
18 : #include <linux/sched.h>
19 : #include <linux/bio.h>
20 : #include <linux/slab.h>
21 : #include <linux/buffer_head.h>
22 : #include <linux/blkdev.h>
23 : #include <linux/random.h>
24 : #include <linux/iocontext.h>
25 : #include <linux/capability.h>
26 : #include <linux/kthread.h>
27 : #include <linux/math64.h>
28 : #include <asm/div64.h>
29 : #include "ctree.h"
30 : #include "extent_map.h"
31 : #include "disk-io.h"
32 : #include "transaction.h"
33 : #include "print-tree.h"
34 : #include "volumes.h"
35 : #include "async-thread.h"
36 : #include "check-integrity.h"
37 : #include "rcu-string.h"
38 : #include "dev-replace.h"
39 : #include "sysfs.h"
40 :
41 : static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
42 : int scrub_ret);
43 : static void btrfs_dev_replace_update_device_in_mapping_tree(
44 : struct btrfs_fs_info *fs_info,
45 : struct btrfs_device *srcdev,
46 : struct btrfs_device *tgtdev);
47 : static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
48 : char *srcdev_name,
49 : struct btrfs_device **device);
50 : static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
51 : static int btrfs_dev_replace_kthread(void *data);
52 : static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
53 :
54 :
55 221 : int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
56 : {
57 : struct btrfs_key key;
58 221 : struct btrfs_root *dev_root = fs_info->dev_root;
59 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
60 : struct extent_buffer *eb;
61 : int slot;
62 : int ret = 0;
63 : struct btrfs_path *path = NULL;
64 : int item_size;
65 : struct btrfs_dev_replace_item *ptr;
66 : u64 src_devid;
67 :
68 221 : path = btrfs_alloc_path();
69 221 : if (!path) {
70 : ret = -ENOMEM;
71 : goto out;
72 : }
73 :
74 221 : key.objectid = 0;
75 221 : key.type = BTRFS_DEV_REPLACE_KEY;
76 221 : key.offset = 0;
77 221 : ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
78 221 : if (ret) {
79 : no_valid_dev_replace_entry_found:
80 : ret = 0;
81 212 : dev_replace->replace_state =
82 : BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
83 212 : dev_replace->cont_reading_from_srcdev_mode =
84 : BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
85 : dev_replace->replace_state = 0;
86 212 : dev_replace->time_started = 0;
87 212 : dev_replace->time_stopped = 0;
88 : atomic64_set(&dev_replace->num_write_errors, 0);
89 : atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
90 212 : dev_replace->cursor_left = 0;
91 212 : dev_replace->committed_cursor_left = 0;
92 212 : dev_replace->cursor_left_last_write_of_item = 0;
93 212 : dev_replace->cursor_right = 0;
94 212 : dev_replace->srcdev = NULL;
95 212 : dev_replace->tgtdev = NULL;
96 212 : dev_replace->is_valid = 0;
97 212 : dev_replace->item_needs_writeback = 0;
98 212 : goto out;
99 : }
100 9 : slot = path->slots[0];
101 9 : eb = path->nodes[0];
102 : item_size = btrfs_item_size_nr(eb, slot);
103 9 : ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
104 :
105 9 : if (item_size != sizeof(struct btrfs_dev_replace_item)) {
106 0 : btrfs_warn(fs_info,
107 : "dev_replace entry found has unexpected size, ignore entry");
108 0 : goto no_valid_dev_replace_entry_found;
109 : }
110 :
111 : src_devid = btrfs_dev_replace_src_devid(eb, ptr);
112 9 : dev_replace->cont_reading_from_srcdev_mode =
113 : btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
114 9 : dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
115 9 : dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
116 9 : dev_replace->time_stopped =
117 : btrfs_dev_replace_time_stopped(eb, ptr);
118 9 : atomic64_set(&dev_replace->num_write_errors,
119 : btrfs_dev_replace_num_write_errors(eb, ptr));
120 9 : atomic64_set(&dev_replace->num_uncorrectable_read_errors,
121 : btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
122 9 : dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
123 9 : dev_replace->committed_cursor_left = dev_replace->cursor_left;
124 9 : dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
125 9 : dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
126 9 : dev_replace->is_valid = 1;
127 :
128 9 : dev_replace->item_needs_writeback = 0;
129 9 : switch (dev_replace->replace_state) {
130 : case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
131 : case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
132 : case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
133 9 : dev_replace->srcdev = NULL;
134 9 : dev_replace->tgtdev = NULL;
135 9 : break;
136 : case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
137 : case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
138 0 : dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
139 : NULL, NULL);
140 0 : dev_replace->tgtdev = btrfs_find_device(fs_info,
141 : BTRFS_DEV_REPLACE_DEVID,
142 : NULL, NULL);
143 : /*
144 : * allow 'btrfs dev replace_cancel' if src/tgt device is
145 : * missing
146 : */
147 0 : if (!dev_replace->srcdev &&
148 0 : !btrfs_test_opt(dev_root, DEGRADED)) {
149 : ret = -EIO;
150 0 : btrfs_warn(fs_info,
151 : "cannot mount because device replace operation is ongoing and");
152 0 : btrfs_warn(fs_info,
153 : "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
154 : src_devid);
155 : }
156 0 : if (!dev_replace->tgtdev &&
157 0 : !btrfs_test_opt(dev_root, DEGRADED)) {
158 : ret = -EIO;
159 0 : btrfs_warn(fs_info,
160 : "cannot mount because device replace operation is ongoing and");
161 0 : btrfs_warn(fs_info,
162 : "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
163 : BTRFS_DEV_REPLACE_DEVID);
164 : }
165 0 : if (dev_replace->tgtdev) {
166 0 : if (dev_replace->srcdev) {
167 0 : dev_replace->tgtdev->total_bytes =
168 0 : dev_replace->srcdev->total_bytes;
169 0 : dev_replace->tgtdev->disk_total_bytes =
170 0 : dev_replace->srcdev->disk_total_bytes;
171 0 : dev_replace->tgtdev->bytes_used =
172 0 : dev_replace->srcdev->bytes_used;
173 : }
174 0 : dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
175 0 : btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
176 : dev_replace->tgtdev);
177 : }
178 : break;
179 : }
180 :
181 : out:
182 221 : if (path)
183 221 : btrfs_free_path(path);
184 221 : return ret;
185 : }
186 :
187 : /*
188 : * called from commit_transaction. Writes changed device replace state to
189 : * disk.
190 : */
191 2098 : int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
192 : struct btrfs_fs_info *fs_info)
193 : {
194 : int ret;
195 2098 : struct btrfs_root *dev_root = fs_info->dev_root;
196 : struct btrfs_path *path;
197 : struct btrfs_key key;
198 : struct extent_buffer *eb;
199 : struct btrfs_dev_replace_item *ptr;
200 2098 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
201 :
202 2098 : btrfs_dev_replace_lock(dev_replace);
203 2147 : if (!dev_replace->is_valid ||
204 49 : !dev_replace->item_needs_writeback) {
205 2065 : btrfs_dev_replace_unlock(dev_replace);
206 2065 : return 0;
207 : }
208 33 : btrfs_dev_replace_unlock(dev_replace);
209 :
210 33 : key.objectid = 0;
211 33 : key.type = BTRFS_DEV_REPLACE_KEY;
212 33 : key.offset = 0;
213 :
214 33 : path = btrfs_alloc_path();
215 33 : if (!path) {
216 : ret = -ENOMEM;
217 : goto out;
218 : }
219 33 : ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
220 33 : if (ret < 0) {
221 0 : btrfs_warn(fs_info, "error %d while searching for dev_replace item!",
222 : ret);
223 0 : goto out;
224 : }
225 :
226 59 : if (ret == 0 &&
227 26 : btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
228 : /*
229 : * need to delete old one and insert a new one.
230 : * Since no attempt is made to recover any old state, if the
231 : * dev_replace state is 'running', the data on the target
232 : * drive is lost.
233 : * It would be possible to recover the state: just make sure
234 : * that the beginning of the item is never changed and always
235 : * contains all the essential information. Then read this
236 : * minimal set of information and use it as a base for the
237 : * new state.
238 : */
239 : ret = btrfs_del_item(trans, dev_root, path);
240 0 : if (ret != 0) {
241 0 : btrfs_warn(fs_info, "delete too small dev_replace item failed %d!",
242 : ret);
243 0 : goto out;
244 : }
245 : ret = 1;
246 : }
247 :
248 33 : if (ret == 1) {
249 : /* need to insert a new item */
250 7 : btrfs_release_path(path);
251 : ret = btrfs_insert_empty_item(trans, dev_root, path,
252 : &key, sizeof(*ptr));
253 7 : if (ret < 0) {
254 0 : btrfs_warn(fs_info, "insert dev_replace item failed %d!",
255 : ret);
256 0 : goto out;
257 : }
258 : }
259 :
260 33 : eb = path->nodes[0];
261 66 : ptr = btrfs_item_ptr(eb, path->slots[0],
262 : struct btrfs_dev_replace_item);
263 :
264 33 : btrfs_dev_replace_lock(dev_replace);
265 33 : if (dev_replace->srcdev)
266 17 : btrfs_set_dev_replace_src_devid(eb, ptr,
267 : dev_replace->srcdev->devid);
268 : else
269 : btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
270 33 : btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
271 : dev_replace->cont_reading_from_srcdev_mode);
272 33 : btrfs_set_dev_replace_replace_state(eb, ptr,
273 : dev_replace->replace_state);
274 33 : btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
275 33 : btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
276 33 : btrfs_set_dev_replace_num_write_errors(eb, ptr,
277 : atomic64_read(&dev_replace->num_write_errors));
278 33 : btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
279 : atomic64_read(&dev_replace->num_uncorrectable_read_errors));
280 33 : dev_replace->cursor_left_last_write_of_item =
281 33 : dev_replace->cursor_left;
282 : btrfs_set_dev_replace_cursor_left(eb, ptr,
283 : dev_replace->cursor_left_last_write_of_item);
284 33 : btrfs_set_dev_replace_cursor_right(eb, ptr,
285 : dev_replace->cursor_right);
286 33 : dev_replace->item_needs_writeback = 0;
287 33 : btrfs_dev_replace_unlock(dev_replace);
288 :
289 33 : btrfs_mark_buffer_dirty(eb);
290 :
291 : out:
292 33 : btrfs_free_path(path);
293 :
294 33 : return ret;
295 : }
296 :
297 2098 : void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
298 : {
299 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
300 :
301 2098 : dev_replace->committed_cursor_left =
302 2098 : dev_replace->cursor_left_last_write_of_item;
303 2098 : }
304 :
305 8 : int btrfs_dev_replace_start(struct btrfs_root *root,
306 : struct btrfs_ioctl_dev_replace_args *args)
307 : {
308 : struct btrfs_trans_handle *trans;
309 16 : struct btrfs_fs_info *fs_info = root->fs_info;
310 8 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
311 : int ret;
312 8 : struct btrfs_device *tgt_device = NULL;
313 8 : struct btrfs_device *src_device = NULL;
314 :
315 8 : if (btrfs_fs_incompat(fs_info, RAID56)) {
316 0 : btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
317 0 : return -EOPNOTSUPP;
318 : }
319 :
320 8 : switch (args->start.cont_reading_from_srcdev_mode) {
321 : case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
322 : case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
323 : break;
324 : default:
325 : return -EINVAL;
326 : }
327 :
328 16 : if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
329 8 : args->start.tgtdev_name[0] == '\0')
330 : return -EINVAL;
331 :
332 8 : mutex_lock(&fs_info->volume_mutex);
333 8 : ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
334 : &tgt_device);
335 8 : if (ret) {
336 0 : btrfs_err(fs_info, "target device %s is invalid!",
337 : args->start.tgtdev_name);
338 0 : mutex_unlock(&fs_info->volume_mutex);
339 0 : return -EINVAL;
340 : }
341 :
342 8 : ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
343 8 : args->start.srcdev_name,
344 : &src_device);
345 8 : mutex_unlock(&fs_info->volume_mutex);
346 8 : if (ret) {
347 : ret = -EINVAL;
348 : goto leave_no_lock;
349 : }
350 :
351 8 : if (tgt_device->total_bytes < src_device->total_bytes) {
352 0 : btrfs_err(fs_info, "target device is smaller than source device!");
353 : ret = -EINVAL;
354 0 : goto leave_no_lock;
355 : }
356 :
357 8 : btrfs_dev_replace_lock(dev_replace);
358 8 : switch (dev_replace->replace_state) {
359 : case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
360 : case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
361 : case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
362 : break;
363 : case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
364 : case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
365 0 : args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
366 0 : goto leave;
367 : }
368 :
369 8 : dev_replace->cont_reading_from_srcdev_mode =
370 8 : args->start.cont_reading_from_srcdev_mode;
371 8 : WARN_ON(!src_device);
372 8 : dev_replace->srcdev = src_device;
373 8 : WARN_ON(!tgt_device);
374 8 : dev_replace->tgtdev = tgt_device;
375 :
376 8 : printk_in_rcu(KERN_INFO
377 : "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
378 : src_device->missing ? "<missing disk>" :
379 : rcu_str_deref(src_device->name),
380 : src_device->devid,
381 : rcu_str_deref(tgt_device->name));
382 :
383 8 : tgt_device->total_bytes = src_device->total_bytes;
384 8 : tgt_device->disk_total_bytes = src_device->disk_total_bytes;
385 8 : tgt_device->bytes_used = src_device->bytes_used;
386 :
387 : /*
388 : * from now on, the writes to the srcdev are all duplicated to
389 : * go to the tgtdev as well (refer to btrfs_map_block()).
390 : */
391 8 : dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
392 8 : dev_replace->time_started = get_seconds();
393 8 : dev_replace->cursor_left = 0;
394 8 : dev_replace->committed_cursor_left = 0;
395 8 : dev_replace->cursor_left_last_write_of_item = 0;
396 8 : dev_replace->cursor_right = 0;
397 8 : dev_replace->is_valid = 1;
398 8 : dev_replace->item_needs_writeback = 1;
399 8 : args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
400 8 : btrfs_dev_replace_unlock(dev_replace);
401 :
402 8 : btrfs_wait_ordered_roots(root->fs_info, -1);
403 :
404 : /* force writing the updated state information to disk */
405 8 : trans = btrfs_start_transaction(root, 0);
406 8 : if (IS_ERR(trans)) {
407 0 : ret = PTR_ERR(trans);
408 0 : btrfs_dev_replace_lock(dev_replace);
409 0 : goto leave;
410 : }
411 :
412 8 : ret = btrfs_commit_transaction(trans, root);
413 8 : WARN_ON(ret);
414 :
415 : /* the disk copy procedure reuses the scrub code */
416 16 : ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
417 8 : src_device->total_bytes,
418 : &dev_replace->scrub_progress, 0, 1);
419 :
420 8 : ret = btrfs_dev_replace_finishing(root->fs_info, ret);
421 8 : WARN_ON(ret);
422 :
423 : return 0;
424 :
425 : leave:
426 0 : dev_replace->srcdev = NULL;
427 0 : dev_replace->tgtdev = NULL;
428 0 : btrfs_dev_replace_unlock(dev_replace);
429 : leave_no_lock:
430 0 : if (tgt_device)
431 0 : btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
432 0 : return ret;
433 : }
434 :
435 : /*
436 : * blocked until all flighting bios are finished.
437 : */
438 7 : static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
439 : {
440 : s64 writers;
441 14 : DEFINE_WAIT(wait);
442 :
443 : set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
444 : do {
445 7 : prepare_to_wait(&fs_info->replace_wait, &wait,
446 : TASK_UNINTERRUPTIBLE);
447 7 : writers = percpu_counter_sum(&fs_info->bio_counter);
448 7 : if (writers)
449 0 : schedule();
450 7 : finish_wait(&fs_info->replace_wait, &wait);
451 7 : } while (writers);
452 7 : }
453 :
454 : /*
455 : * we have removed target device, it is safe to allow new bios request.
456 : */
457 7 : static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
458 : {
459 : clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
460 7 : if (waitqueue_active(&fs_info->replace_wait))
461 0 : wake_up(&fs_info->replace_wait);
462 7 : }
463 :
464 8 : static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
465 : int scrub_ret)
466 : {
467 8 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
468 : struct btrfs_device *tgt_device;
469 : struct btrfs_device *src_device;
470 8 : struct btrfs_root *root = fs_info->tree_root;
471 : u8 uuid_tmp[BTRFS_UUID_SIZE];
472 : struct btrfs_trans_handle *trans;
473 : int ret = 0;
474 :
475 : /* don't allow cancel or unmount to disturb the finishing procedure */
476 8 : mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
477 :
478 8 : btrfs_dev_replace_lock(dev_replace);
479 : /* was the operation canceled, or is it finished? */
480 8 : if (dev_replace->replace_state !=
481 : BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
482 1 : btrfs_dev_replace_unlock(dev_replace);
483 1 : mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
484 1 : return 0;
485 : }
486 :
487 7 : tgt_device = dev_replace->tgtdev;
488 7 : src_device = dev_replace->srcdev;
489 7 : btrfs_dev_replace_unlock(dev_replace);
490 :
491 : /*
492 : * flush all outstanding I/O and inode extent mappings before the
493 : * copy operation is declared as being finished
494 : */
495 7 : ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
496 7 : if (ret) {
497 0 : mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
498 0 : return ret;
499 : }
500 7 : btrfs_wait_ordered_roots(root->fs_info, -1);
501 :
502 7 : trans = btrfs_start_transaction(root, 0);
503 7 : if (IS_ERR(trans)) {
504 0 : mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
505 0 : return PTR_ERR(trans);
506 : }
507 7 : ret = btrfs_commit_transaction(trans, root);
508 7 : WARN_ON(ret);
509 :
510 : /* keep away write_all_supers() during the finishing procedure */
511 7 : mutex_lock(&root->fs_info->chunk_mutex);
512 7 : mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
513 7 : btrfs_dev_replace_lock(dev_replace);
514 7 : dev_replace->replace_state =
515 : scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
516 : : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
517 7 : dev_replace->tgtdev = NULL;
518 7 : dev_replace->srcdev = NULL;
519 7 : dev_replace->time_stopped = get_seconds();
520 7 : dev_replace->item_needs_writeback = 1;
521 :
522 : /* replace old device with new one in mapping tree */
523 7 : if (!scrub_ret) {
524 7 : btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
525 : src_device,
526 : tgt_device);
527 : } else {
528 0 : printk_in_rcu(KERN_ERR
529 : "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
530 : src_device->missing ? "<missing disk>" :
531 : rcu_str_deref(src_device->name),
532 : src_device->devid,
533 : rcu_str_deref(tgt_device->name), scrub_ret);
534 0 : btrfs_dev_replace_unlock(dev_replace);
535 0 : mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
536 0 : mutex_unlock(&root->fs_info->chunk_mutex);
537 0 : if (tgt_device)
538 0 : btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
539 0 : mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
540 :
541 0 : return 0;
542 : }
543 :
544 14 : printk_in_rcu(KERN_INFO
545 : "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n",
546 : src_device->missing ? "<missing disk>" :
547 : rcu_str_deref(src_device->name),
548 : src_device->devid,
549 : rcu_str_deref(tgt_device->name));
550 7 : tgt_device->is_tgtdev_for_dev_replace = 0;
551 7 : tgt_device->devid = src_device->devid;
552 7 : src_device->devid = BTRFS_DEV_REPLACE_DEVID;
553 7 : tgt_device->bytes_used = src_device->bytes_used;
554 7 : memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
555 7 : memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
556 7 : memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
557 7 : tgt_device->total_bytes = src_device->total_bytes;
558 7 : tgt_device->disk_total_bytes = src_device->disk_total_bytes;
559 7 : tgt_device->bytes_used = src_device->bytes_used;
560 7 : if (fs_info->sb->s_bdev == src_device->bdev)
561 0 : fs_info->sb->s_bdev = tgt_device->bdev;
562 7 : if (fs_info->fs_devices->latest_bdev == src_device->bdev)
563 6 : fs_info->fs_devices->latest_bdev = tgt_device->bdev;
564 7 : list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
565 :
566 : /* replace the sysfs entry */
567 7 : btrfs_kobj_rm_device(fs_info, src_device);
568 7 : btrfs_kobj_add_device(fs_info, tgt_device);
569 :
570 7 : btrfs_rm_dev_replace_blocked(fs_info);
571 :
572 7 : btrfs_rm_dev_replace_srcdev(fs_info, src_device);
573 :
574 7 : btrfs_rm_dev_replace_unblocked(fs_info);
575 :
576 : /*
577 : * this is again a consistent state where no dev_replace procedure
578 : * is running, the target device is part of the filesystem, the
579 : * source device is not part of the filesystem anymore and its 1st
580 : * superblock is scratched out so that it is no longer marked to
581 : * belong to this filesystem.
582 : */
583 7 : btrfs_dev_replace_unlock(dev_replace);
584 7 : mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
585 7 : mutex_unlock(&root->fs_info->chunk_mutex);
586 :
587 : /* write back the superblocks */
588 7 : trans = btrfs_start_transaction(root, 0);
589 7 : if (!IS_ERR(trans))
590 7 : btrfs_commit_transaction(trans, root);
591 :
592 7 : mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
593 :
594 7 : return 0;
595 : }
596 :
597 7 : static void btrfs_dev_replace_update_device_in_mapping_tree(
598 : struct btrfs_fs_info *fs_info,
599 : struct btrfs_device *srcdev,
600 : struct btrfs_device *tgtdev)
601 : {
602 7 : struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
603 : struct extent_map *em;
604 : struct map_lookup *map;
605 : u64 start = 0;
606 : int i;
607 :
608 7 : write_lock(&em_tree->lock);
609 : do {
610 47 : em = lookup_extent_mapping(em_tree, start, (u64)-1);
611 47 : if (!em)
612 : break;
613 40 : map = (struct map_lookup *)em->bdev;
614 97 : for (i = 0; i < map->num_stripes; i++)
615 57 : if (srcdev == map->stripes[i].dev)
616 38 : map->stripes[i].dev = tgtdev;
617 40 : start = em->start + em->len;
618 40 : free_extent_map(em);
619 40 : } while (start);
620 : write_unlock(&em_tree->lock);
621 7 : }
622 :
623 8 : static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
624 : char *srcdev_name,
625 : struct btrfs_device **device)
626 : {
627 : int ret;
628 :
629 8 : if (srcdevid) {
630 : ret = 0;
631 0 : *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
632 : NULL);
633 0 : if (!*device)
634 : ret = -ENOENT;
635 : } else {
636 8 : ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
637 : device);
638 : }
639 8 : return ret;
640 : }
641 :
642 16 : void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
643 : struct btrfs_ioctl_dev_replace_args *args)
644 : {
645 16 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
646 :
647 16 : btrfs_dev_replace_lock(dev_replace);
648 : /* even if !dev_replace_is_valid, the values are good enough for
649 : * the replace_status ioctl */
650 16 : args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
651 16 : args->status.replace_state = dev_replace->replace_state;
652 16 : args->status.time_started = dev_replace->time_started;
653 16 : args->status.time_stopped = dev_replace->time_stopped;
654 16 : args->status.num_write_errors =
655 : atomic64_read(&dev_replace->num_write_errors);
656 16 : args->status.num_uncorrectable_read_errors =
657 : atomic64_read(&dev_replace->num_uncorrectable_read_errors);
658 16 : switch (dev_replace->replace_state) {
659 : case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
660 : case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
661 8 : args->status.progress_1000 = 0;
662 8 : break;
663 : case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
664 8 : args->status.progress_1000 = 1000;
665 8 : break;
666 : case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
667 : case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
668 0 : args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
669 0 : div64_u64(dev_replace->srcdev->total_bytes, 1000));
670 0 : break;
671 : }
672 16 : btrfs_dev_replace_unlock(dev_replace);
673 16 : }
674 :
675 1 : int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
676 : struct btrfs_ioctl_dev_replace_args *args)
677 : {
678 1 : args->result = __btrfs_dev_replace_cancel(fs_info);
679 1 : return 0;
680 : }
681 :
682 1 : static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
683 : {
684 1 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
685 : struct btrfs_device *tgt_device = NULL;
686 : struct btrfs_trans_handle *trans;
687 1 : struct btrfs_root *root = fs_info->tree_root;
688 : u64 result;
689 : int ret;
690 :
691 1 : if (fs_info->sb->s_flags & MS_RDONLY)
692 : return -EROFS;
693 :
694 1 : mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
695 1 : btrfs_dev_replace_lock(dev_replace);
696 1 : switch (dev_replace->replace_state) {
697 : case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
698 : case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
699 : case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
700 : result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
701 0 : btrfs_dev_replace_unlock(dev_replace);
702 0 : goto leave;
703 : case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
704 : case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
705 : result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
706 1 : tgt_device = dev_replace->tgtdev;
707 1 : dev_replace->tgtdev = NULL;
708 1 : dev_replace->srcdev = NULL;
709 1 : break;
710 : }
711 1 : dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
712 1 : dev_replace->time_stopped = get_seconds();
713 1 : dev_replace->item_needs_writeback = 1;
714 1 : btrfs_dev_replace_unlock(dev_replace);
715 1 : btrfs_scrub_cancel(fs_info);
716 :
717 1 : trans = btrfs_start_transaction(root, 0);
718 1 : if (IS_ERR(trans)) {
719 0 : mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
720 0 : return PTR_ERR(trans);
721 : }
722 1 : ret = btrfs_commit_transaction(trans, root);
723 1 : WARN_ON(ret);
724 1 : if (tgt_device)
725 1 : btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
726 :
727 : leave:
728 1 : mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
729 1 : return result;
730 : }
731 :
732 222 : void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
733 : {
734 222 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
735 :
736 222 : mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
737 222 : btrfs_dev_replace_lock(dev_replace);
738 222 : switch (dev_replace->replace_state) {
739 : case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
740 : case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
741 : case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
742 : case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
743 : break;
744 : case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
745 0 : dev_replace->replace_state =
746 : BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
747 0 : dev_replace->time_stopped = get_seconds();
748 0 : dev_replace->item_needs_writeback = 1;
749 0 : btrfs_info(fs_info, "suspending dev_replace for unmount");
750 0 : break;
751 : }
752 :
753 222 : btrfs_dev_replace_unlock(dev_replace);
754 222 : mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
755 222 : }
756 :
757 : /* resume dev_replace procedure that was interrupted by unmount */
758 194 : int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
759 : {
760 : struct task_struct *task;
761 194 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
762 :
763 194 : btrfs_dev_replace_lock(dev_replace);
764 194 : switch (dev_replace->replace_state) {
765 : case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
766 : case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
767 : case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
768 194 : btrfs_dev_replace_unlock(dev_replace);
769 194 : return 0;
770 : case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
771 : break;
772 : case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
773 0 : dev_replace->replace_state =
774 : BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
775 0 : break;
776 : }
777 0 : if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
778 0 : btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
779 0 : btrfs_info(fs_info,
780 : "you may cancel the operation after 'mount -o degraded'");
781 0 : btrfs_dev_replace_unlock(dev_replace);
782 0 : return 0;
783 : }
784 0 : btrfs_dev_replace_unlock(dev_replace);
785 :
786 0 : WARN_ON(atomic_xchg(
787 : &fs_info->mutually_exclusive_operation_running, 1));
788 0 : task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
789 0 : return PTR_ERR_OR_ZERO(task);
790 : }
791 :
792 0 : static int btrfs_dev_replace_kthread(void *data)
793 : {
794 : struct btrfs_fs_info *fs_info = data;
795 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
796 : struct btrfs_ioctl_dev_replace_args *status_args;
797 : u64 progress;
798 :
799 0 : status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
800 0 : if (status_args) {
801 0 : btrfs_dev_replace_status(fs_info, status_args);
802 0 : progress = status_args->status.progress_1000;
803 0 : kfree(status_args);
804 0 : do_div(progress, 10);
805 0 : printk_in_rcu(KERN_INFO
806 : "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
807 : dev_replace->srcdev->missing ? "<missing disk>" :
808 : rcu_str_deref(dev_replace->srcdev->name),
809 : dev_replace->srcdev->devid,
810 : dev_replace->tgtdev ?
811 : rcu_str_deref(dev_replace->tgtdev->name) :
812 : "<missing target disk>",
813 : (unsigned int)progress);
814 : }
815 0 : btrfs_dev_replace_continue_on_mount(fs_info);
816 : atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
817 :
818 0 : return 0;
819 : }
820 :
821 0 : static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
822 : {
823 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
824 : int ret;
825 :
826 0 : ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
827 : dev_replace->committed_cursor_left,
828 0 : dev_replace->srcdev->total_bytes,
829 : &dev_replace->scrub_progress, 0, 1);
830 0 : ret = btrfs_dev_replace_finishing(fs_info, ret);
831 0 : WARN_ON(ret);
832 0 : return 0;
833 : }
834 :
835 253844 : int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
836 : {
837 253844 : if (!dev_replace->is_valid)
838 : return 0;
839 :
840 113894 : switch (dev_replace->replace_state) {
841 : case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
842 : case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
843 : case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
844 : return 0;
845 : case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
846 : case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
847 : /*
848 : * return true even if tgtdev is missing (this is
849 : * something that can happen if the dev_replace
850 : * procedure is suspended by an umount and then
851 : * the tgtdev is missing (or "btrfs dev scan") was
852 : * not called and the the filesystem is remounted
853 : * in degraded state. This does not stop the
854 : * dev_replace procedure. It needs to be canceled
855 : * manually if the cancelation is wanted.
856 : */
857 : break;
858 : }
859 94595 : return 1;
860 : }
861 :
862 256422 : void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
863 : {
864 : /* the beginning is just an optimization for the typical case */
865 256422 : if (atomic_read(&dev_replace->nesting_level) == 0) {
866 : acquire_lock:
867 : /* this is not a nested case where the same thread
868 : * is trying to acqurire the same lock twice */
869 256422 : mutex_lock(&dev_replace->lock);
870 256438 : mutex_lock(&dev_replace->lock_management_lock);
871 256439 : dev_replace->lock_owner = current->pid;
872 256439 : atomic_inc(&dev_replace->nesting_level);
873 256439 : mutex_unlock(&dev_replace->lock_management_lock);
874 256439 : return;
875 : }
876 :
877 39 : mutex_lock(&dev_replace->lock_management_lock);
878 67 : if (atomic_read(&dev_replace->nesting_level) > 0 &&
879 56 : dev_replace->lock_owner == current->pid) {
880 0 : WARN_ON(!mutex_is_locked(&dev_replace->lock));
881 0 : atomic_inc(&dev_replace->nesting_level);
882 0 : mutex_unlock(&dev_replace->lock_management_lock);
883 0 : return;
884 : }
885 :
886 39 : mutex_unlock(&dev_replace->lock_management_lock);
887 39 : goto acquire_lock;
888 : }
889 :
890 256438 : void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
891 : {
892 256438 : WARN_ON(!mutex_is_locked(&dev_replace->lock));
893 256438 : mutex_lock(&dev_replace->lock_management_lock);
894 256439 : WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
895 512876 : WARN_ON(dev_replace->lock_owner != current->pid);
896 256438 : atomic_dec(&dev_replace->nesting_level);
897 256439 : if (atomic_read(&dev_replace->nesting_level) == 0) {
898 256439 : dev_replace->lock_owner = 0;
899 256439 : mutex_unlock(&dev_replace->lock_management_lock);
900 256439 : mutex_unlock(&dev_replace->lock);
901 : } else {
902 0 : mutex_unlock(&dev_replace->lock_management_lock);
903 : }
904 256438 : }
905 :
906 212908 : void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
907 : {
908 212908 : percpu_counter_inc(&fs_info->bio_counter);
909 212906 : }
910 :
911 360460 : void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
912 : {
913 360460 : percpu_counter_dec(&fs_info->bio_counter);
914 :
915 360467 : if (waitqueue_active(&fs_info->replace_wait))
916 0 : wake_up(&fs_info->replace_wait);
917 360467 : }
918 :
919 147582 : void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
920 : {
921 295164 : DEFINE_WAIT(wait);
922 : again:
923 147582 : percpu_counter_inc(&fs_info->bio_counter);
924 147588 : if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
925 0 : btrfs_bio_counter_dec(fs_info);
926 0 : wait_event(fs_info->replace_wait,
927 : !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
928 : &fs_info->fs_state));
929 : goto again;
930 : }
931 :
932 147588 : }
|