Line data Source code
1 : /*
2 : * Copyright (C) 2007 Oracle. All rights reserved.
3 : *
4 : * This program is free software; you can redistribute it and/or
5 : * modify it under the terms of the GNU General Public
6 : * License v2 as published by the Free Software Foundation.
7 : *
8 : * This program is distributed in the hope that it will be useful,
9 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 : * General Public License for more details.
12 : *
13 : * You should have received a copy of the GNU General Public
14 : * License along with this program; if not, write to the
15 : * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 : * Boston, MA 02111-1307, USA.
17 : */
18 :
19 : #include <linux/slab.h>
20 : #include <linux/blkdev.h>
21 : #include <linux/writeback.h>
22 : #include <linux/pagevec.h>
23 : #include "ctree.h"
24 : #include "transaction.h"
25 : #include "btrfs_inode.h"
26 : #include "extent_io.h"
27 : #include "disk-io.h"
28 :
29 : static struct kmem_cache *btrfs_ordered_extent_cache;
30 :
31 : static u64 entry_end(struct btrfs_ordered_extent *entry)
32 : {
33 257359 : if (entry->file_offset + entry->len < entry->file_offset)
34 : return (u64)-1;
35 : return entry->file_offset + entry->len;
36 : }
37 :
38 : /* returns NULL if the insertion worked, or the node already present
39 : * in the tree at that offset
40 : */
41 51501 : static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
42 : struct rb_node *node)
43 : {
44 51501 : struct rb_node **p = &root->rb_node;
45 : struct rb_node *parent = NULL;
46 70981 : struct btrfs_ordered_extent *entry;
47 :
48 175100 : while (*p) {
49 : parent = *p;
50 : entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
51 :
52 72097 : if (file_offset < entry->file_offset)
53 1116 : p = &(*p)->rb_left;
54 70981 : else if (file_offset >= entry_end(entry))
55 70982 : p = &(*p)->rb_right;
56 : else
57 : return parent;
58 : }
59 :
60 : rb_link_node(node, parent, p);
61 51502 : rb_insert_color(node, root);
62 51500 : return NULL;
63 : }
64 :
65 0 : static void ordered_data_tree_panic(struct inode *inode, int errno,
66 : u64 offset)
67 : {
68 0 : struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
69 0 : btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
70 : "%llu", offset);
71 : }
72 :
73 : /*
74 : * look for a given offset in the tree; if it can't be found, return NULL
75 : * and point *prev_ret at the entry that ends closest before it
76 : */
77 2178179 : static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
78 : struct rb_node **prev_ret)
79 : {
80 2178179 : struct rb_node *n = root->rb_node;
81 : struct rb_node *prev = NULL;
82 : struct rb_node *test;
83 124452 : struct btrfs_ordered_extent *entry;
84 10174 : struct btrfs_ordered_extent *prev_entry = NULL;
85 :
86 4446273 : while (n) {
87 152192 : entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
88 : prev = n;
89 : prev_entry = entry;
90 :
91 152192 : if (file_offset < entry->file_offset)
92 27740 : n = n->rb_left;
93 124452 : else if (file_offset >= entry_end(entry))
94 62175 : n = n->rb_right;
95 : else
96 : return n;
97 : }
98 2115902 : if (!prev_ret)
99 : return NULL;
100 :
101 2120983 : while (prev && file_offset >= entry_end(prev_entry)) {
102 5008 : test = rb_next(prev);
103 5008 : if (!test)
104 : break;
105 2 : prev_entry = rb_entry(test, struct btrfs_ordered_extent,
106 : rb_node);
107 2 : if (file_offset < entry_end(prev_entry))
108 : break;
109 :
110 : prev = test;
111 : }
112 2115905 : if (prev)
113 5079 : prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
114 : rb_node);
115 2121014 : while (prev && file_offset < entry_end(prev_entry)) {
116 71 : test = rb_prev(prev);
117 71 : if (!test)
118 : break;
119 15 : prev_entry = rb_entry(test, struct btrfs_ordered_extent,
120 : rb_node);
121 : prev = test;
122 : }
123 2115905 : *prev_ret = prev;
124 2115905 : return NULL;
125 : }
126 :
127 : /*
128 : * helper to check if a given offset is inside a given entry
129 : */
130 : static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
131 : {
132 5626158 : if (file_offset < entry->file_offset ||
133 2811806 : entry->file_offset + entry->len <= file_offset)
134 : return 0;
135 : return 1;
136 : }
137 :
138 : static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
139 : u64 len)
140 : {
141 4418 : if (file_offset + len <= entry->file_offset ||
142 2203 : entry->file_offset + entry->len <= file_offset)
143 : return 0;
144 : return 1;
145 : }
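/*
 * Both helpers above treat an entry as the half-open byte range
 * [file_offset, file_offset + len). A minimal worked example, assuming a
 * hypothetical 4K entry at offset 0:
 *
 *	offset_in_entry(entry, 0)        -> 1
 *	offset_in_entry(entry, 4095)     -> 1
 *	offset_in_entry(entry, 4096)     -> 0	(first byte past the end)
 *	range_overlaps(entry, 4096, 10)  -> 0	(touching is not overlapping)
 */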
146 :
147 : /*
148 : * find the first ordered struct that contains this offset, otherwise
149 : * the first one less than this offset
150 : */
151 3540964 : static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
152 : u64 file_offset)
153 : {
154 3540964 : struct rb_root *root = &tree->tree;
155 3540964 : struct rb_node *prev = NULL;
156 : struct rb_node *ret;
157 1389046 : struct btrfs_ordered_extent *entry;
158 :
159 3540964 : if (tree->last) {
160 : entry = rb_entry(tree->last, struct btrfs_ordered_extent,
161 : rb_node);
162 1389046 : if (offset_in_entry(entry, file_offset))
163 : return tree->last;
164 : }
165 2178117 : ret = __tree_search(root, file_offset, &prev);
166 2178175 : if (!ret)
167 2115908 : ret = prev;
168 2178175 : if (ret)
169 67358 : tree->last = ret;
170 2178175 : return ret;
171 : }
172 :
173 : /* allocate and add a new ordered_extent into the per-inode tree.
174 : * file_offset is the logical offset in the file
175 : *
176 : * start is the disk block number of an extent already reserved in the
177 : * extent allocation tree
178 : *
179 : * len is the length of the extent
180 : *
181 : * The tree is given a single reference on the ordered extent that was
182 : * inserted.
183 : */
184 51494 : static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
185 : u64 start, u64 len, u64 disk_len,
186 : int type, int dio, int compress_type)
187 : {
188 51494 : struct btrfs_root *root = BTRFS_I(inode)->root;
189 : struct btrfs_ordered_inode_tree *tree;
190 : struct rb_node *node;
191 : struct btrfs_ordered_extent *entry;
192 :
193 : tree = &BTRFS_I(inode)->ordered_tree;
194 51494 : entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
195 51497 : if (!entry)
196 : return -ENOMEM;
197 :
198 51498 : entry->file_offset = file_offset;
199 51498 : entry->start = start;
200 51498 : entry->len = len;
201 51498 : if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
202 : !(type == BTRFS_ORDERED_NOCOW))
203 47440 : entry->csum_bytes_left = disk_len;
204 51498 : entry->disk_len = disk_len;
205 51498 : entry->bytes_left = len;
206 51498 : entry->inode = igrab(inode);
207 51505 : entry->compress_type = compress_type;
208 51505 : entry->truncated_len = (u64)-1;
209 51505 : if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
210 5860 : set_bit(type, &entry->flags);
211 :
212 51503 : if (dio)
213 : set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
214 :
215 : /* one ref for the tree */
216 : atomic_set(&entry->refs, 1);
217 51503 : init_waitqueue_head(&entry->wait);
218 51500 : INIT_LIST_HEAD(&entry->list);
219 51500 : INIT_LIST_HEAD(&entry->root_extent_list);
220 51500 : INIT_LIST_HEAD(&entry->work_list);
221 : init_completion(&entry->completion);
222 51498 : INIT_LIST_HEAD(&entry->log_list);
223 :
224 51498 : trace_btrfs_ordered_extent_add(inode, entry);
225 :
226 : spin_lock_irq(&tree->lock);
227 51501 : node = tree_insert(&tree->tree, file_offset,
228 : &entry->rb_node);
229 51499 : if (node)
230 0 : ordered_data_tree_panic(inode, -EEXIST, file_offset);
231 : spin_unlock_irq(&tree->lock);
232 :
233 : spin_lock(&root->ordered_extent_lock);
234 51503 : list_add_tail(&entry->root_extent_list,
235 : &root->ordered_extents);
236 51503 : root->nr_ordered_extents++;
237 51503 : if (root->nr_ordered_extents == 1) {
238 30682 : spin_lock(&root->fs_info->ordered_root_lock);
239 61370 : BUG_ON(!list_empty(&root->ordered_root));
240 30685 : list_add_tail(&root->ordered_root,
241 30685 : &root->fs_info->ordered_roots);
242 30685 : spin_unlock(&root->fs_info->ordered_root_lock);
243 : }
244 : spin_unlock(&root->ordered_extent_lock);
245 :
246 51504 : return 0;
247 : }
248 :
249 26091 : int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
250 : u64 start, u64 len, u64 disk_len, int type)
251 : {
252 26091 : return __btrfs_add_ordered_extent(inode, file_offset, start, len,
253 : disk_len, type, 0,
254 : BTRFS_COMPRESS_NONE);
255 : }
256 :
257 25252 : int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
258 : u64 start, u64 len, u64 disk_len, int type)
259 : {
260 25252 : return __btrfs_add_ordered_extent(inode, file_offset, start, len,
261 : disk_len, type, 1,
262 : BTRFS_COMPRESS_NONE);
263 : }
264 :
265 153 : int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
266 : u64 start, u64 len, u64 disk_len,
267 : int type, int compress_type)
268 : {
269 153 : return __btrfs_add_ordered_extent(inode, file_offset, start, len,
270 : disk_len, type, 0,
271 : compress_type);
272 : }
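/*
 * A minimal sketch of how a write path is expected to drive the helpers
 * above, assuming an extent already reserved in the allocation tree;
 * example_queue_ordered() and the sizes are hypothetical. Type 0 sets no
 * special flag bit, i.e. a plain COW write.
 */
static int example_queue_ordered(struct inode *inode, u64 disk_start)
{
	u64 file_offset = 0;
	u64 len = 128 * 1024;

	/* on success, the tree holds one reference on the new ordered extent */
	return btrfs_add_ordered_extent(inode, file_offset, disk_start,
					len, len, 0);
}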
273 :
274 : /*
275 : * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
276 : * when an ordered extent is finished. If the list covers more than one
277 : * ordered extent, it is split across multiple ordered extents.
278 : */
279 90839 : void btrfs_add_ordered_sum(struct inode *inode,
280 : struct btrfs_ordered_extent *entry,
281 : struct btrfs_ordered_sum *sum)
282 : {
283 : struct btrfs_ordered_inode_tree *tree;
284 :
285 : tree = &BTRFS_I(inode)->ordered_tree;
286 : spin_lock_irq(&tree->lock);
287 90884 : list_add_tail(&sum->list, &entry->list);
288 90883 : WARN_ON(entry->csum_bytes_left < sum->len);
289 90882 : entry->csum_bytes_left -= sum->len;
290 90882 : if (entry->csum_bytes_left == 0)
291 47444 : wake_up(&entry->wait);
292 : spin_unlock_irq(&tree->lock);
293 90876 : }
294 :
295 : /*
296 : * this is used to account for finished IO across a given range
297 : * of the file. The IO may span ordered extents. If
298 : * a given ordered_extent is completely done, 1 is returned, otherwise
299 : * 0.
300 : *
301 : * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
302 : * to make sure this function only returns 1 once for a given ordered extent.
303 : *
304 : * file_offset is updated to one byte past the range that is recorded as
305 : * complete. This allows you to walk forward in the file.
306 : */
307 25260 : int btrfs_dec_test_first_ordered_pending(struct inode *inode,
308 : struct btrfs_ordered_extent **cached,
309 : u64 *file_offset, u64 io_size, int uptodate)
310 : {
311 : struct btrfs_ordered_inode_tree *tree;
312 : struct rb_node *node;
313 25260 : struct btrfs_ordered_extent *entry = NULL;
314 : int ret;
315 : unsigned long flags;
316 : u64 dec_end;
317 : u64 dec_start;
318 : u64 to_dec;
319 :
320 25260 : tree = &BTRFS_I(inode)->ordered_tree;
321 25260 : spin_lock_irqsave(&tree->lock, flags);
322 25260 : node = tree_search(tree, *file_offset);
323 25260 : if (!node) {
324 : ret = 1;
325 : goto out;
326 : }
327 :
328 25260 : entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
329 50520 : if (!offset_in_entry(entry, *file_offset)) {
330 : ret = 1;
331 : goto out;
332 : }
333 :
334 25260 : dec_start = max(*file_offset, entry->file_offset);
335 25260 : dec_end = min(*file_offset + io_size, entry->file_offset +
336 : entry->len);
337 25260 : *file_offset = dec_end;
338 25260 : if (dec_start > dec_end) {
339 0 : btrfs_crit(BTRFS_I(inode)->root->fs_info,
340 : "bad ordering dec_start %llu end %llu", dec_start, dec_end);
341 : }
342 25258 : to_dec = dec_end - dec_start;
343 25258 : if (to_dec > entry->bytes_left) {
344 0 : btrfs_crit(BTRFS_I(inode)->root->fs_info,
345 : "bad ordered accounting left %llu size %llu",
346 : entry->bytes_left, to_dec);
347 : }
348 25258 : entry->bytes_left -= to_dec;
349 25258 : if (!uptodate)
350 : set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
351 :
352 25258 : if (entry->bytes_left == 0) {
353 25259 : ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
354 25260 : if (waitqueue_active(&entry->wait))
355 0 : wake_up(&entry->wait);
356 : } else {
357 : ret = 1;
358 : }
359 : out:
360 25259 : if (!ret && cached && entry) {
361 25259 : *cached = entry;
362 25259 : atomic_inc(&entry->refs);
363 : }
364 : spin_unlock_irqrestore(&tree->lock, flags);
365 25260 : return ret == 0;
366 : }
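/*
 * A minimal sketch of consuming a completed IO range that may span several
 * ordered extents, using the walk-forward contract described above;
 * example_finish_range() is hypothetical.
 */
static void example_finish_range(struct inode *inode, u64 offset, u64 bytes)
{
	u64 end = offset + bytes;

	while (offset < end) {
		struct btrfs_ordered_extent *ordered = NULL;
		u64 prev = offset;

		/* on a hit, offset advances one byte past the accounted range */
		if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
							 &offset, end - offset,
							 1))
			/* fully written: drop the reference taken for us */
			btrfs_put_ordered_extent(ordered);
		if (offset == prev)	/* no ordered extent covers this range */
			break;
	}
}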
367 :
368 : /*
369 : * this is used to account for finished IO across a given range
370 : * of the file. The IO should not span ordered extents. If
371 : * a given ordered_extent is completely done, 1 is returned, otherwise
372 : * 0.
373 : *
374 : * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
375 : * to make sure this function only returns 1 once for a given ordered extent.
376 : */
377 1309063 : int btrfs_dec_test_ordered_pending(struct inode *inode,
378 : struct btrfs_ordered_extent **cached,
379 : u64 file_offset, u64 io_size, int uptodate)
380 : {
381 : struct btrfs_ordered_inode_tree *tree;
382 : struct rb_node *node;
383 1309068 : struct btrfs_ordered_extent *entry = NULL;
384 : unsigned long flags;
385 : int ret;
386 :
387 1309063 : tree = &BTRFS_I(inode)->ordered_tree;
388 1309063 : spin_lock_irqsave(&tree->lock, flags);
389 1309071 : if (cached && *cached) {
390 : entry = *cached;
391 : goto have_entry;
392 : }
393 :
394 1309068 : node = tree_search(tree, file_offset);
395 1309065 : if (!node) {
396 : ret = 1;
397 : goto out;
398 : }
399 :
400 1309065 : entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
401 : have_entry:
402 1309068 : if (!offset_in_entry(entry, file_offset)) {
403 : ret = 1;
404 : goto out;
405 : }
406 :
407 1309068 : if (io_size > entry->bytes_left) {
408 0 : btrfs_crit(BTRFS_I(inode)->root->fs_info,
409 : "bad ordered accounting left %llu size %llu",
410 : entry->bytes_left, io_size);
411 : }
412 1309069 : entry->bytes_left -= io_size;
413 1309069 : if (!uptodate)
414 : set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
415 :
416 1309069 : if (entry->bytes_left == 0) {
417 26245 : ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
418 26245 : if (waitqueue_active(&entry->wait))
419 152 : wake_up(&entry->wait);
420 : } else {
421 : ret = 1;
422 : }
423 : out:
424 1309069 : if (!ret && cached && entry) {
425 26245 : *cached = entry;
426 26245 : atomic_inc(&entry->refs);
427 : }
428 : spin_unlock_irqrestore(&tree->lock, flags);
429 1309066 : return ret == 0;
430 : }
431 :
432 : /* Needs to be called either under a log transaction or while holding the log_mutex */
433 1824 : void btrfs_get_logged_extents(struct inode *inode,
434 : struct list_head *logged_list)
435 : {
436 : struct btrfs_ordered_inode_tree *tree;
437 : struct btrfs_ordered_extent *ordered;
438 : struct rb_node *n;
439 :
440 : tree = &BTRFS_I(inode)->ordered_tree;
441 : spin_lock_irq(&tree->lock);
442 2999 : for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
443 : ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
444 2350 : if (!list_empty(&ordered->log_list))
445 0 : continue;
446 : list_add_tail(&ordered->log_list, logged_list);
447 1175 : atomic_inc(&ordered->refs);
448 : }
449 : spin_unlock_irq(&tree->lock);
450 1824 : }
451 :
452 0 : void btrfs_put_logged_extents(struct list_head *logged_list)
453 : {
454 : struct btrfs_ordered_extent *ordered;
455 :
456 0 : while (!list_empty(logged_list)) {
457 0 : ordered = list_first_entry(logged_list,
458 : struct btrfs_ordered_extent,
459 : log_list);
460 0 : list_del_init(&ordered->log_list);
461 0 : btrfs_put_ordered_extent(ordered);
462 : }
463 0 : }
464 :
465 1824 : void btrfs_submit_logged_extents(struct list_head *logged_list,
466 : struct btrfs_root *log)
467 : {
468 1824 : int index = log->log_transid % 2;
469 :
470 1824 : spin_lock_irq(&log->log_extents_lock[index]);
471 1824 : list_splice_tail(logged_list, &log->logged_list[index]);
472 : spin_unlock_irq(&log->log_extents_lock[index]);
473 1824 : }
474 :
475 1477 : void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
476 : {
477 : struct btrfs_ordered_extent *ordered;
478 1477 : int index = transid % 2;
479 :
480 1477 : spin_lock_irq(&log->log_extents_lock[index]);
481 5304 : while (!list_empty(&log->logged_list[index])) {
482 1175 : ordered = list_first_entry(&log->logged_list[index],
483 : struct btrfs_ordered_extent,
484 : log_list);
485 1175 : list_del_init(&ordered->log_list);
486 : spin_unlock_irq(&log->log_extents_lock[index]);
487 :
488 1175 : if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
489 : !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
490 0 : struct inode *inode = ordered->inode;
491 0 : u64 start = ordered->file_offset;
492 0 : u64 end = ordered->file_offset + ordered->len - 1;
493 :
494 0 : WARN_ON(!inode);
495 0 : filemap_fdatawrite_range(inode->i_mapping, start, end);
496 : }
497 1175 : wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
498 : &ordered->flags));
499 :
500 1175 : btrfs_put_ordered_extent(ordered);
501 : spin_lock_irq(&log->log_extents_lock[index]);
502 : }
503 : spin_unlock_irq(&log->log_extents_lock[index]);
504 1477 : }
505 :
506 898 : void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
507 : {
508 : struct btrfs_ordered_extent *ordered;
509 898 : int index = transid % 2;
510 :
511 898 : spin_lock_irq(&log->log_extents_lock[index]);
512 1796 : while (!list_empty(&log->logged_list[index])) {
513 0 : ordered = list_first_entry(&log->logged_list[index],
514 : struct btrfs_ordered_extent,
515 : log_list);
516 0 : list_del_init(&ordered->log_list);
517 : spin_unlock_irq(&log->log_extents_lock[index]);
518 0 : btrfs_put_ordered_extent(ordered);
519 : spin_lock_irq(&log->log_extents_lock[index]);
520 : }
521 : spin_unlock_irq(&log->log_extents_lock[index]);
522 898 : }
523 :
524 : /*
525 : * used to drop a reference on an ordered extent. This will free
526 : * the extent if the last reference is dropped
527 : */
528 198897 : void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
529 : {
530 : struct list_head *cur;
531 : struct btrfs_ordered_sum *sum;
532 :
533 198897 : trace_btrfs_ordered_extent_put(entry->inode, entry);
534 :
535 397816 : if (atomic_dec_and_test(&entry->refs)) {
536 51505 : if (entry->inode)
537 51505 : btrfs_add_delayed_iput(entry->inode);
538 284776 : while (!list_empty(&entry->list)) {
539 : cur = entry->list.next;
540 90883 : sum = list_entry(cur, struct btrfs_ordered_sum, list);
541 90883 : list_del(&sum->list);
542 90883 : kfree(sum);
543 : }
544 51505 : kmem_cache_free(btrfs_ordered_extent_cache, entry);
545 : }
546 198925 : }
547 :
548 : /*
549 : * remove an ordered extent from the tree. No references are dropped,
550 : * but any waiters are woken up.
551 : */
552 51505 : void btrfs_remove_ordered_extent(struct inode *inode,
553 : struct btrfs_ordered_extent *entry)
554 : {
555 : struct btrfs_ordered_inode_tree *tree;
556 51505 : struct btrfs_root *root = BTRFS_I(inode)->root;
557 : struct rb_node *node;
558 :
559 : tree = &BTRFS_I(inode)->ordered_tree;
560 : spin_lock_irq(&tree->lock);
561 51505 : node = &entry->rb_node;
562 51505 : rb_erase(node, &tree->tree);
563 51505 : if (tree->last == node)
564 41164 : tree->last = NULL;
565 : set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
566 : spin_unlock_irq(&tree->lock);
567 :
568 : spin_lock(&root->ordered_extent_lock);
569 51505 : list_del_init(&entry->root_extent_list);
570 51505 : root->nr_ordered_extents--;
571 :
572 51505 : trace_btrfs_ordered_extent_remove(inode, entry);
573 :
574 51505 : if (!root->nr_ordered_extents) {
575 30685 : spin_lock(&root->fs_info->ordered_root_lock);
576 61370 : BUG_ON(list_empty(&root->ordered_root));
577 : list_del_init(&root->ordered_root);
578 30685 : spin_unlock(&root->fs_info->ordered_root_lock);
579 : }
580 : spin_unlock(&root->ordered_extent_lock);
581 51505 : wake_up(&entry->wait);
582 51504 : }
583 :
584 1053 : static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
585 : {
586 : struct btrfs_ordered_extent *ordered;
587 :
588 1053 : ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
589 1053 : btrfs_start_ordered_extent(ordered->inode, ordered, 1);
590 1049 : complete(&ordered->completion);
591 1052 : }
592 :
593 : /*
594 : * wait for all the ordered extents in a root. This is done when balancing
595 : * space between drives.
596 : */
597 193 : int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
598 : {
599 : struct list_head splice, works;
600 : struct btrfs_ordered_extent *ordered, *next;
601 : int count = 0;
602 :
603 : INIT_LIST_HEAD(&splice);
604 : INIT_LIST_HEAD(&works);
605 :
606 193 : mutex_lock(&root->ordered_extent_mutex);
607 : spin_lock(&root->ordered_extent_lock);
608 193 : list_splice_init(&root->ordered_extents, &splice);
609 1246 : while (!list_empty(&splice) && nr) {
610 : ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
611 : root_extent_list);
612 1053 : list_move_tail(&ordered->root_extent_list,
613 : &root->ordered_extents);
614 1053 : atomic_inc(&ordered->refs);
615 : spin_unlock(&root->ordered_extent_lock);
616 :
617 1053 : btrfs_init_work(&ordered->flush_work,
618 : btrfs_flush_delalloc_helper,
619 : btrfs_run_ordered_extent_work, NULL, NULL);
620 1053 : list_add_tail(&ordered->work_list, &works);
621 1053 : btrfs_queue_work(root->fs_info->flush_workers,
622 : &ordered->flush_work);
623 :
624 1053 : cond_resched();
625 : spin_lock(&root->ordered_extent_lock);
626 1053 : if (nr != -1)
627 14 : nr--;
628 1053 : count++;
629 : }
630 : list_splice_tail(&splice, &root->ordered_extents);
631 : spin_unlock(&root->ordered_extent_lock);
632 :
633 1246 : list_for_each_entry_safe(ordered, next, &works, work_list) {
634 : list_del_init(&ordered->work_list);
635 1053 : wait_for_completion(&ordered->completion);
636 1053 : btrfs_put_ordered_extent(ordered);
637 1053 : cond_resched();
638 : }
639 193 : mutex_unlock(&root->ordered_extent_mutex);
640 :
641 193 : return count;
642 : }
643 :
644 2492 : void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
645 : {
646 : struct btrfs_root *root;
647 : struct list_head splice;
648 : int done;
649 :
650 : INIT_LIST_HEAD(&splice);
651 :
652 2492 : mutex_lock(&fs_info->ordered_operations_mutex);
653 : spin_lock(&fs_info->ordered_root_lock);
654 2493 : list_splice_init(&fs_info->ordered_roots, &splice);
655 2540 : while (!list_empty(&splice) && nr) {
656 47 : root = list_first_entry(&splice, struct btrfs_root,
657 : ordered_root);
658 47 : root = btrfs_grab_fs_root(root);
659 47 : BUG_ON(!root);
660 47 : list_move_tail(&root->ordered_root,
661 : &fs_info->ordered_roots);
662 : spin_unlock(&fs_info->ordered_root_lock);
663 :
664 47 : done = btrfs_wait_ordered_extents(root, nr);
665 47 : btrfs_put_fs_root(root);
666 :
667 : spin_lock(&fs_info->ordered_root_lock);
668 47 : if (nr != -1) {
669 3 : nr -= done;
670 3 : WARN_ON(nr < 0);
671 : }
672 : }
673 : list_splice_tail(&splice, &fs_info->ordered_roots);
674 : spin_unlock(&fs_info->ordered_root_lock);
675 2493 : mutex_unlock(&fs_info->ordered_operations_mutex);
676 2493 : }
677 :
678 : /*
679 : * Used to start IO or wait for a given ordered extent to finish.
680 : *
681 : * If wait is one, this effectively waits on page writeback for all the pages
682 : * in the extent, and it waits on the io completion code to insert
683 : * metadata into the btree corresponding to the extent
684 : */
685 3875 : void btrfs_start_ordered_extent(struct inode *inode,
686 : struct btrfs_ordered_extent *entry,
687 : int wait)
688 : {
689 3875 : u64 start = entry->file_offset;
690 3875 : u64 end = start + entry->len - 1;
691 :
692 3875 : trace_btrfs_ordered_extent_start(inode, entry);
693 :
694 : /*
695 : * pages in the range can be dirty, clean or writeback. We
696 : * start IO on any dirty ones so the wait doesn't stall waiting
697 : * for the flusher thread to find them
698 : */
699 3875 : if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
700 3875 : filemap_fdatawrite_range(inode->i_mapping, start, end);
701 3875 : if (wait) {
702 14832 : wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
703 : &entry->flags));
704 : }
705 3874 : }
706 :
707 : /*
708 : * Used to wait on ordered extents across a large range of bytes.
709 : */
710 17638 : int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
711 : {
712 : int ret = 0;
713 : u64 end;
714 : u64 orig_end;
715 : struct btrfs_ordered_extent *ordered;
716 :
717 17638 : if (start + len < start) {
718 : orig_end = INT_LIMIT(loff_t);
719 : } else {
720 15421 : orig_end = start + len - 1;
721 15421 : if (orig_end > INT_LIMIT(loff_t))
722 : orig_end = INT_LIMIT(loff_t);
723 : }
724 :
725 : /* start IO across the range first to instantiate any delalloc
726 : * extents
727 : */
728 17638 : ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
729 17637 : if (ret)
730 : return ret;
731 : /*
732 : * So with compression we will find and lock a dirty page and clear the
733 : * first one as dirty, setup an async extent, and immediately return
734 : * with the entire range locked but with nobody actually marked with
735 : * writeback. So we can't just filemap_write_and_wait_range() and
736 : * expect it to work since it will just kick off a thread to do the
737 : * actual work. So we need to call filemap_fdatawrite_range _again_
738 : * since it will wait on the page lock, which won't be unlocked until
739 : * after the pages have been marked as writeback and so we're good to go
740 : * from there. We have to do this otherwise we'll miss the ordered
741 : * extents and that results in badness. Please Josef, do not think you
742 : * know better and pull this out at some point in the future, it is
743 : * right and you are wrong.
744 : */
745 17637 : if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
746 : &BTRFS_I(inode)->runtime_flags)) {
747 90 : ret = filemap_fdatawrite_range(inode->i_mapping, start,
748 : orig_end);
749 90 : if (ret)
750 : return ret;
751 : }
752 17637 : ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
753 17638 : if (ret)
754 : return ret;
755 :
756 : end = orig_end;
757 : while (1) {
758 17855 : ordered = btrfs_lookup_first_ordered_extent(inode, end);
759 17855 : if (!ordered)
760 : break;
761 2809 : if (ordered->file_offset > orig_end) {
762 0 : btrfs_put_ordered_extent(ordered);
763 0 : break;
764 : }
765 2809 : if (ordered->file_offset + ordered->len <= start) {
766 4 : btrfs_put_ordered_extent(ordered);
767 4 : break;
768 : }
769 2805 : btrfs_start_ordered_extent(inode, ordered, 1);
770 2805 : end = ordered->file_offset;
771 2805 : if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
772 : ret = -EIO;
773 2805 : btrfs_put_ordered_extent(ordered);
774 2805 : if (ret || end == 0 || end == start)
775 : break;
776 217 : end--;
777 217 : }
778 17638 : return ret;
779 : }
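/*
 * A minimal sketch of the common whole-file flush seen on fsync-like paths;
 * example_flush_inode() is hypothetical, and (u64)-1 is the conventional
 * "to the end of the file" length (the overflow check at the top of
 * btrfs_wait_ordered_range() clamps it).
 */
static int example_flush_inode(struct inode *inode)
{
	/* start writeback and wait for every ordered extent to complete */
	return btrfs_wait_ordered_range(inode, 0, (u64)-1);
}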
780 :
781 : /*
782 : * find an ordered extent corresponding to file_offset. return NULL if
783 : * nothing is found, otherwise take a reference on the extent and return it
784 : */
785 1919242 : struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
786 : u64 file_offset)
787 : {
788 : struct btrfs_ordered_inode_tree *tree;
789 : struct rb_node *node;
790 90972 : struct btrfs_ordered_extent *entry = NULL;
791 :
792 1919242 : tree = &BTRFS_I(inode)->ordered_tree;
793 : spin_lock_irq(&tree->lock);
794 1919295 : node = tree_search(tree, file_offset);
795 1919292 : if (!node)
796 : goto out;
797 :
798 90972 : entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
799 90972 : if (!offset_in_entry(entry, file_offset))
800 : entry = NULL;
801 90972 : if (entry)
802 90889 : atomic_inc(&entry->refs);
803 : out:
804 : spin_unlock_irq(&tree->lock);
805 1919296 : return entry;
806 : }
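/*
 * A minimal sketch of the lookup/put pairing callers are expected to follow:
 * a successful lookup returns with a reference held, which must be dropped
 * with btrfs_put_ordered_extent(). The helper name is hypothetical.
 */
static bool example_offset_is_ordered(struct inode *inode, u64 offset)
{
	struct btrfs_ordered_extent *ordered;

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
		return false;
	btrfs_put_ordered_extent(ordered);	/* drop the lookup reference */
	return true;
}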
807 :
808 : /* Since the DIO code tries to lock a wide area, we need to look for any ordered
809 : * extents that exist in the range, rather than just the start of the range.
810 : */
811 118648 : struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
812 : u64 file_offset,
813 : u64 len)
814 : {
815 : struct btrfs_ordered_inode_tree *tree;
816 : struct rb_node *node;
817 2215 : struct btrfs_ordered_extent *entry = NULL;
818 :
819 118648 : tree = &BTRFS_I(inode)->ordered_tree;
820 : spin_lock_irq(&tree->lock);
821 118653 : node = tree_search(tree, file_offset);
822 118652 : if (!node) {
823 116440 : node = tree_search(tree, file_offset + len);
824 116440 : if (!node)
825 : goto out;
826 : }
827 :
828 : while (1) {
829 2215 : entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
830 2215 : if (range_overlaps(entry, file_offset, len))
831 : break;
832 :
833 2207 : if (entry->file_offset >= file_offset + len) {
834 : entry = NULL;
835 : break;
836 : }
837 : entry = NULL;
838 2194 : node = rb_next(node);
839 2195 : if (!node)
840 : break;
841 : }
842 : out:
843 118653 : if (entry)
844 8 : atomic_inc(&entry->refs);
845 : spin_unlock_irq(&tree->lock);
846 118651 : return entry;
847 : }
848 :
849 : /*
850 : * lookup and return the ordered extent covering 'file_offset', or the
851 : * closest one before it. NULL is returned if none is found
852 : */
853 : struct btrfs_ordered_extent *
854 46076 : btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
855 : {
856 : struct btrfs_ordered_inode_tree *tree;
857 : struct rb_node *node;
858 : struct btrfs_ordered_extent *entry = NULL;
859 :
860 46076 : tree = &BTRFS_I(inode)->ordered_tree;
861 : spin_lock_irq(&tree->lock);
862 46076 : node = tree_search(tree, file_offset);
863 46076 : if (!node)
864 : goto out;
865 :
866 2812 : entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
867 2812 : atomic_inc(&entry->refs);
868 : out:
869 : spin_unlock_irq(&tree->lock);
870 46076 : return entry;
871 : }
872 :
873 : /*
874 : * After an extent is done, call this to conditionally update the on disk
875 : * i_size. i_size is updated to cover any fully written part of the file.
876 : */
877 131334 : int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
878 51505 : struct btrfs_ordered_extent *ordered)
879 : {
880 65667 : struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
881 : u64 disk_i_size;
882 : u64 new_i_size;
883 65667 : u64 i_size = i_size_read(inode);
884 : struct rb_node *node;
885 : struct rb_node *prev = NULL;
886 253 : struct btrfs_ordered_extent *test;
887 : int ret = 1;
888 :
889 : spin_lock_irq(&tree->lock);
890 65667 : if (ordered) {
891 : offset = entry_end(ordered);
892 51505 : if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
893 0 : offset = min(offset,
894 : ordered->file_offset +
895 : ordered->truncated_len);
896 : } else {
897 14162 : offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
898 : }
899 65667 : disk_i_size = BTRFS_I(inode)->disk_i_size;
900 :
901 : /* truncate file */
902 65667 : if (disk_i_size > i_size) {
903 560 : BTRFS_I(inode)->disk_i_size = i_size;
904 : ret = 0;
905 560 : goto out;
906 : }
907 :
908 : /*
909 : * if the disk i_size is already at the inode->i_size, or
910 : * this ordered extent is inside the disk i_size, we're done
911 : */
912 65107 : if (disk_i_size == i_size)
913 : goto out;
914 :
915 : /*
916 : * We still need to update disk_i_size if outstanding_isize is greater
917 : * than disk_i_size.
918 : */
919 20819 : if (offset <= disk_i_size &&
920 79 : (!ordered || ordered->outstanding_isize <= disk_i_size))
921 : goto out;
922 :
923 : /*
924 : * walk backward from this ordered extent to disk_i_size.
925 : * if we find an ordered extent then we can't update disk i_size
926 : * yet
927 : */
928 20740 : if (ordered) {
929 14354 : node = rb_prev(&ordered->rb_node);
930 : } else {
931 6386 : prev = tree_search(tree, offset);
932 : /*
933 : * we insert file extents without involving ordered struct,
934 : * so there should be no ordered struct cover this offset
935 : */
936 6386 : if (prev) {
937 : test = rb_entry(prev, struct btrfs_ordered_extent,
938 : rb_node);
939 6 : BUG_ON(offset_in_entry(test, offset));
940 : }
941 : node = prev;
942 : }
943 24 : for (; node; node = rb_prev(node)) {
944 : test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
945 :
946 : /* We treat this entry as if it doesn't exist */
947 305 : if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
948 24 : continue;
949 281 : if (test->file_offset + test->len <= disk_i_size)
950 : break;
951 247 : if (test->file_offset >= i_size)
952 : break;
953 247 : if (entry_end(test) > disk_i_size) {
954 : /*
955 : * we don't update disk_i_size now, so record this
957 : * pending i_size; otherwise we will not know the
958 : * real i_size.
958 : */
959 247 : if (test->outstanding_isize < offset)
960 247 : test->outstanding_isize = offset;
961 488 : if (ordered &&
962 241 : ordered->outstanding_isize >
963 241 : test->outstanding_isize)
964 15 : test->outstanding_isize =
965 : ordered->outstanding_isize;
966 : goto out;
967 : }
968 : }
969 20493 : new_i_size = min_t(u64, offset, i_size);
970 :
971 : /*
972 : * Some ordered extents may completed before the current one, and
973 : * we hold the real i_size in ->outstanding_isize.
974 : */
975 20493 : if (ordered && ordered->outstanding_isize > new_i_size)
976 180 : new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
977 20493 : BTRFS_I(inode)->disk_i_size = new_i_size;
978 : ret = 0;
979 : out:
980 : /*
981 : * We need to do this because we can't remove ordered extents until
982 : * after the i_disk_size has been updated and then the inode has been
983 : * updated to reflect the change, so we need to tell anybody who finds
984 : * this ordered extent that we've already done all the real work, we
985 : * just haven't completed all the other work.
986 : */
987 65667 : if (ordered)
988 : set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
989 : spin_unlock_irq(&tree->lock);
990 65667 : return ret;
991 : }
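/*
 * A worked example of the update above, assuming hypothetical 4K writes at
 * offsets 0, 8K and 4K whose ordered extents complete in that order, with
 * i_size already at 12K:
 *
 *	[0,4K)   done -> disk_i_size = 4K
 *	[8K,12K) done -> [4K,8K) is still pending below it, so only its
 *			 outstanding_isize is raised to 12K
 *	[4K,8K)  done -> nothing pending remains below; disk_i_size jumps
 *			 to 12K via the recorded outstanding_isize
 */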
992 :
993 : /*
994 : * search the ordered extents for one corresponding to 'offset' and
995 : * try to find a checksum. This is used because we allow pages to
996 : * be reclaimed before their checksum is actually put into the btree
997 : */
998 26589 : int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
999 : u32 *sum, int len)
1000 : {
1001 : struct btrfs_ordered_sum *ordered_sum;
1002 : struct btrfs_ordered_extent *ordered;
1003 : struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
1004 : unsigned long num_sectors;
1005 : unsigned long i;
1006 26589 : u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
1007 : int index = 0;
1008 :
1009 26589 : ordered = btrfs_lookup_ordered_extent(inode, offset);
1010 26589 : if (!ordered)
1011 : return 0;
1012 :
1013 : spin_lock_irq(&tree->lock);
1014 0 : list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
1015 0 : if (disk_bytenr >= ordered_sum->bytenr &&
1016 0 : disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
1017 0 : i = (disk_bytenr - ordered_sum->bytenr) >>
1018 0 : inode->i_sb->s_blocksize_bits;
1019 0 : num_sectors = ordered_sum->len >>
1020 : inode->i_sb->s_blocksize_bits;
1021 0 : num_sectors = min_t(int, len - index, num_sectors - i);
1022 0 : memcpy(sum + index, ordered_sum->sums + i,
1023 : num_sectors);
1024 :
1025 0 : index += (int)num_sectors;
1026 0 : if (index == len)
1027 : goto out;
1028 0 : disk_bytenr += num_sectors * sectorsize;
1029 : }
1030 : }
1031 : out:
1032 : spin_unlock_irq(&tree->lock);
1033 0 : btrfs_put_ordered_extent(ordered);
1034 0 : return index;
1035 : }
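/*
 * A small worked example of the sector math above, assuming 4K sectors
 * (s_blocksize_bits == 12) and a hypothetical sum entry with bytenr == 1M
 * and len == 64K: for disk_bytenr == 1M + 8K the first matching csum sits
 * at i = (8K >> 12) = 2, the entry holds (64K >> 12) = 16 csums, and each
 * copied sector advances disk_bytenr by 4K.
 */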
1036 :
1037 0 : int __init ordered_data_init(void)
1038 : {
1039 0 : btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
1040 : sizeof(struct btrfs_ordered_extent), 0,
1041 : SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
1042 : NULL);
1043 0 : if (!btrfs_ordered_extent_cache)
1044 : return -ENOMEM;
1045 :
1046 0 : return 0;
1047 : }
1048 :
1049 0 : void ordered_data_exit(void)
1050 : {
1051 0 : if (btrfs_ordered_extent_cache)
1052 0 : kmem_cache_destroy(btrfs_ordered_extent_cache);
1053 0 : }