Line data Source code
1 : /*
2 : * Copyright (C) 2007 Oracle. All rights reserved.
3 : *
4 : * This program is free software; you can redistribute it and/or
5 : * modify it under the terms of the GNU General Public
6 : * License v2 as published by the Free Software Foundation.
7 : *
8 : * This program is distributed in the hope that it will be useful,
9 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 : * General Public License for more details.
12 : *
13 : * You should have received a copy of the GNU General Public
14 : * License along with this program; if not, write to the
15 : * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 : * Boston, MA 02110-1301, USA.
17 : */
18 :
19 : #include <linux/fs.h>
20 : #include <linux/pagemap.h>
21 : #include <linux/highmem.h>
22 : #include <linux/time.h>
23 : #include <linux/init.h>
24 : #include <linux/string.h>
25 : #include <linux/backing-dev.h>
26 : #include <linux/mpage.h>
27 : #include <linux/aio.h>
28 : #include <linux/falloc.h>
29 : #include <linux/swap.h>
30 : #include <linux/writeback.h>
31 : #include <linux/statfs.h>
32 : #include <linux/compat.h>
33 : #include <linux/slab.h>
34 : #include <linux/btrfs.h>
35 : #include "ctree.h"
36 : #include "disk-io.h"
37 : #include "transaction.h"
38 : #include "btrfs_inode.h"
39 : #include "print-tree.h"
40 : #include "tree-log.h"
41 : #include "locking.h"
42 : #include "volumes.h"
43 : #include "qgroup.h"
44 :
45 : static struct kmem_cache *btrfs_inode_defrag_cachep;
46 : /*
47 : * When auto defrag is enabled, we queue up these
48 : * defrag structs to remember which inodes need
49 : * defragging passes.
50 : */
51 : struct inode_defrag {
52 : struct rb_node rb_node;
53 : /* objectid */
54 : u64 ino;
55 : /*
56 : * transid at which the defrag was added; we search for
57 : * extents newer than this
58 : */
59 : u64 transid;
60 :
61 : /* root objectid */
62 : u64 root;
63 :
64 : /* last offset we were able to defrag */
65 : u64 last_offset;
66 :
67 : /* if we've wrapped around back to zero once already */
68 : int cycled;
69 : };
70 :
71 : static int __compare_inode_defrag(struct inode_defrag *defrag1,
72 : struct inode_defrag *defrag2)
73 : {
74 0 : if (defrag1->root > defrag2->root)
75 : return 1;
76 0 : else if (defrag1->root < defrag2->root)
77 : return -1;
78 0 : else if (defrag1->ino > defrag2->ino)
79 : return 1;
80 0 : else if (defrag1->ino < defrag2->ino)
81 : return -1;
82 : else
83 : return 0;
84 : }
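
/*
 * Worked example of the ordering above (editor's illustration, not in
 * the original source): records are compared by root objectid first,
 * then by inode number, so (root=5, ino=10) < (root=5, ino=11) <
 * (root=6, ino=1). The defrag tree is therefore one rb-tree per
 * filesystem, keyed on the (root, ino) pair.
 */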
85 :
86 : /* insert a record for an inode into the defrag tree. The
87 : * fs_info->defrag_inodes_lock must be held already
88 : *
89 : * If you're inserting a record for an older transid than an
90 : * existing record, the transid already in the tree is lowered
91 : *
92 : * If an existing record is found, the defrag item you
93 : * pass in is freed
94 : */
95 0 : static int __btrfs_add_inode_defrag(struct inode *inode,
96 0 : struct inode_defrag *defrag)
97 : {
98 0 : struct btrfs_root *root = BTRFS_I(inode)->root;
99 0 : struct inode_defrag *entry;
100 : struct rb_node **p;
101 : struct rb_node *parent = NULL;
102 : int ret;
103 :
104 0 : p = &root->fs_info->defrag_inodes.rb_node;
105 0 : while (*p) {
106 : parent = *p;
107 : entry = rb_entry(parent, struct inode_defrag, rb_node);
108 :
109 : ret = __compare_inode_defrag(defrag, entry);
110 0 : if (ret < 0)
111 0 : p = &parent->rb_left;
112 0 : else if (ret > 0)
113 0 : p = &parent->rb_right;
114 : else {
115 : /* if we're reinserting an entry for
116 : * an old defrag run, make sure to
117 : * lower the transid of our existing record
118 : */
119 0 : if (defrag->transid < entry->transid)
120 0 : entry->transid = defrag->transid;
121 0 : if (defrag->last_offset > entry->last_offset)
122 0 : entry->last_offset = defrag->last_offset;
123 : return -EEXIST;
124 : }
125 : }
126 : set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
127 0 : rb_link_node(&defrag->rb_node, parent, p);
128 0 : rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
129 0 : return 0;
130 : }
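
/*
 * Merge semantics illustrated (editor's sketch): if the tree already
 * holds a record with transid 8 and last_offset 0 for this inode and
 * we insert one with transid 5 and last_offset 4096, the existing
 * record is lowered to transid 5, bumped to last_offset 4096, and
 * -EEXIST tells the caller to free the record it passed in.
 */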
131 :
132 : static inline int __need_auto_defrag(struct btrfs_root *root)
133 : {
134 12202 : if (!btrfs_test_opt(root, AUTO_DEFRAG))
135 : return 0;
136 :
137 0 : if (btrfs_fs_closing(root->fs_info))
138 : return 0;
139 :
140 : return 1;
141 : }
142 :
143 : /*
144 : * insert a defrag record for this inode if auto defrag is
145 : * enabled
146 : */
147 11907 : int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
148 : struct inode *inode)
149 : {
150 11907 : struct btrfs_root *root = BTRFS_I(inode)->root;
151 : struct inode_defrag *defrag;
152 : u64 transid;
153 : int ret;
154 :
155 11907 : if (!__need_auto_defrag(root))
156 : return 0;
157 :
158 0 : if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
159 : return 0;
160 :
161 0 : if (trans)
162 0 : transid = trans->transid;
163 : else
164 0 : transid = BTRFS_I(inode)->root->last_trans;
165 :
166 0 : defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
167 0 : if (!defrag)
168 : return -ENOMEM;
169 :
170 0 : defrag->ino = btrfs_ino(inode);
171 0 : defrag->transid = transid;
172 0 : defrag->root = root->root_key.objectid;
173 :
174 0 : spin_lock(&root->fs_info->defrag_inodes_lock);
175 0 : if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
176 : /*
177 : * If we set the IN_DEFRAG flag and the inode is then evicted
178 : * from memory and re-read, the new in-memory inode won't have
179 : * the flag set, so we may find an existing defrag record here.
180 : */
181 0 : ret = __btrfs_add_inode_defrag(inode, defrag);
182 0 : if (ret)
183 0 : kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
184 : } else {
185 0 : kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
186 : }
187 0 : spin_unlock(&root->fs_info->defrag_inodes_lock);
188 0 : return 0;
189 : }
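
/*
 * Usage sketch (editor's illustration, with a hypothetical condition):
 * write paths that notice they are creating small extents queue the
 * inode, roughly
 *
 *	if (wrote_small_extent)
 *		btrfs_add_inode_defrag(trans, inode);
 *
 * and the actual defragging happens later, from the cleaner thread.
 */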
190 :
191 : /*
192 : * Requeue the defrag object. If there is a defrag object that points to
193 : * the same inode in the tree, we will merge them together (by
194 : * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
195 : */
196 0 : static void btrfs_requeue_inode_defrag(struct inode *inode,
197 : struct inode_defrag *defrag)
198 : {
199 0 : struct btrfs_root *root = BTRFS_I(inode)->root;
200 : int ret;
201 :
202 0 : if (!__need_auto_defrag(root))
203 : goto out;
204 :
205 : /*
206 : * Here we don't check the IN_DEFRAG flag, because we need to
207 : * merge the records together.
208 : */
209 0 : spin_lock(&root->fs_info->defrag_inodes_lock);
210 0 : ret = __btrfs_add_inode_defrag(inode, defrag);
211 0 : spin_unlock(&root->fs_info->defrag_inodes_lock);
212 0 : if (ret)
213 : goto out;
214 0 : return;
215 : out:
216 0 : kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
217 : }
218 :
219 : /*
220 : * Pick the defraggable inode that we want; if it doesn't exist, we will
221 : * get the next one.
222 : */
223 : static struct inode_defrag *
224 0 : btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
225 : {
226 0 : struct inode_defrag *entry = NULL;
227 : struct inode_defrag tmp;
228 : struct rb_node *p;
229 : struct rb_node *parent = NULL;
230 : int ret;
231 :
232 : tmp.ino = ino;
233 : tmp.root = root;
234 :
235 : spin_lock(&fs_info->defrag_inodes_lock);
236 0 : p = fs_info->defrag_inodes.rb_node;
237 0 : while (p) {
238 : parent = p;
239 : entry = rb_entry(parent, struct inode_defrag, rb_node);
240 :
241 : ret = __compare_inode_defrag(&tmp, entry);
242 0 : if (ret < 0)
243 0 : p = parent->rb_left;
244 0 : else if (ret > 0)
245 0 : p = parent->rb_right;
246 : else
247 : goto out;
248 : }
249 :
250 0 : if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
251 0 : parent = rb_next(parent);
252 0 : if (parent)
253 : entry = rb_entry(parent, struct inode_defrag, rb_node);
254 : else
255 : entry = NULL;
256 : }
257 : out:
258 0 : if (entry)
259 0 : rb_erase(parent, &fs_info->defrag_inodes);
260 : spin_unlock(&fs_info->defrag_inodes_lock);
261 0 : return entry;
262 : }
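
/*
 * Example of the "next one" behaviour above (editor's illustration):
 * with records for (root=5, ino=3) and (root=5, ino=7) in the tree, a
 * lookup for (root=5, ino=4) misses, and rb_next() moves on to
 * (root=5, ino=7), which is erased from the tree and returned.
 */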
263 :
264 221 : void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
265 : {
266 : struct inode_defrag *defrag;
267 : struct rb_node *node;
268 :
269 : spin_lock(&fs_info->defrag_inodes_lock);
270 221 : node = rb_first(&fs_info->defrag_inodes);
271 442 : while (node) {
272 0 : rb_erase(node, &fs_info->defrag_inodes);
273 : defrag = rb_entry(node, struct inode_defrag, rb_node);
274 0 : kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
275 :
276 0 : if (need_resched()) {
277 : spin_unlock(&fs_info->defrag_inodes_lock);
278 0 : cond_resched();
279 : spin_lock(&fs_info->defrag_inodes_lock);
280 : }
281 :
282 0 : node = rb_first(&fs_info->defrag_inodes);
283 : }
284 : spin_unlock(&fs_info->defrag_inodes_lock);
285 221 : }
286 :
287 : #define BTRFS_DEFRAG_BATCH 1024
288 :
289 0 : static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
290 : struct inode_defrag *defrag)
291 : {
292 : struct btrfs_root *inode_root;
293 : struct inode *inode;
294 : struct btrfs_key key;
295 : struct btrfs_ioctl_defrag_range_args range;
296 : int num_defrag;
297 : int index;
298 : int ret;
299 :
300 : /* get the inode */
301 0 : key.objectid = defrag->root;
302 : btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
303 0 : key.offset = (u64)-1;
304 :
305 0 : index = srcu_read_lock(&fs_info->subvol_srcu);
306 :
307 : inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
308 0 : if (IS_ERR(inode_root)) {
309 0 : ret = PTR_ERR(inode_root);
310 0 : goto cleanup;
311 : }
312 :
313 0 : key.objectid = defrag->ino;
314 : btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
315 0 : key.offset = 0;
316 0 : inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
317 0 : if (IS_ERR(inode)) {
318 0 : ret = PTR_ERR(inode);
319 0 : goto cleanup;
320 : }
321 : srcu_read_unlock(&fs_info->subvol_srcu, index);
322 :
323 : /* do a chunk of defrag */
324 : clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
325 0 : memset(&range, 0, sizeof(range));
326 0 : range.len = (u64)-1;
327 0 : range.start = defrag->last_offset;
328 :
329 0 : sb_start_write(fs_info->sb);
330 0 : num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
331 : BTRFS_DEFRAG_BATCH);
332 0 : sb_end_write(fs_info->sb);
333 : /*
334 : * if we filled the whole defrag batch, there
335 : * must be more work to do. Queue this defrag
336 : * again
337 : */
338 0 : if (num_defrag == BTRFS_DEFRAG_BATCH) {
339 0 : defrag->last_offset = range.start;
340 0 : btrfs_requeue_inode_defrag(inode, defrag);
341 0 : } else if (defrag->last_offset && !defrag->cycled) {
342 : /*
343 : * we didn't fill our defrag batch, but
344 : * we didn't start at zero. Make sure we loop
345 : * around to the start of the file.
346 : */
347 0 : defrag->last_offset = 0;
348 0 : defrag->cycled = 1;
349 0 : btrfs_requeue_inode_defrag(inode, defrag);
350 : } else {
351 0 : kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
352 : }
353 :
354 0 : iput(inode);
355 0 : return 0;
356 : cleanup:
357 : srcu_read_unlock(&fs_info->subvol_srcu, index);
358 0 : kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
359 0 : return ret;
360 : }
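
/*
 * The requeue cycle illustrated (editor's sketch): with
 * BTRFS_DEFRAG_BATCH == 1024, a heavily fragmented file is defragged
 * 1024 extents at a time; every full batch records its progress in
 * defrag->last_offset and requeues, and the first short batch that
 * did not start at offset 0 requeues once more from the beginning
 * (the "cycled" pass) before the record is finally freed.
 */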
361 :
362 : /*
363 : * run through the list of inodes in the FS that need
364 : * defragging
365 : */
366 295 : int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
367 : {
368 : struct inode_defrag *defrag;
369 : u64 first_ino = 0;
370 : u64 root_objectid = 0;
371 :
372 295 : atomic_inc(&fs_info->defrag_running);
373 : while (1) {
374 : /* Pause the auto defragger. */
375 295 : if (test_bit(BTRFS_FS_STATE_REMOUNTING,
376 : &fs_info->fs_state))
377 : break;
378 :
379 590 : if (!__need_auto_defrag(fs_info->tree_root))
380 : break;
381 :
382 : /* find an inode to defrag */
383 0 : defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
384 : first_ino);
385 0 : if (!defrag) {
386 0 : if (root_objectid || first_ino) {
387 : root_objectid = 0;
388 : first_ino = 0;
389 0 : continue;
390 : } else {
391 : break;
392 : }
393 : }
394 :
395 0 : first_ino = defrag->ino + 1;
396 0 : root_objectid = defrag->root;
397 :
398 0 : __btrfs_run_defrag_inode(fs_info, defrag);
399 : }
400 : atomic_dec(&fs_info->defrag_running);
401 :
402 : /*
403 : * during unmount, we use the transaction_wait queue to
404 : * wait for the defragger to stop
405 : */
406 295 : wake_up(&fs_info->transaction_wait);
407 295 : return 0;
408 : }
409 :
410 : /* simple helper to fault in pages and copy. This should go away
411 : * and be replaced with calls into generic code.
412 : */
413 118028 : static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
414 : size_t write_bytes,
415 : struct page **prepared_pages,
416 : struct iov_iter *i)
417 : {
418 : size_t copied = 0;
419 : size_t total_copied = 0;
420 : int pg = 0;
421 118028 : int offset = pos & (PAGE_CACHE_SIZE - 1);
422 :
423 1268380 : while (write_bytes > 0) {
424 1150351 : size_t count = min_t(size_t,
425 : PAGE_CACHE_SIZE - offset, write_bytes);
426 1150351 : struct page *page = prepared_pages[pg];
427 : /*
428 : * Copy data from userspace to the current page
429 : */
430 1150351 : copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
431 :
432 : /* Flush processor's dcache for this page */
433 : flush_dcache_page(page);
434 :
435 : /*
436 : * if we get a partial write, we can end up with
437 : * partially up to date pages. These add
438 : * a lot of complexity, so make sure they don't
439 : * happen by forcing this copy to be retried.
440 : *
441 : * The rest of the btrfs_file_write code will fall
442 : * back to page at a time copies after we return 0.
443 : */
444 1150357 : if (!PageUptodate(page) && copied < count)
445 : copied = 0;
446 :
447 1150357 : iov_iter_advance(i, copied);
448 1150351 : write_bytes -= copied;
449 1150351 : total_copied += copied;
450 :
451 : /* Return to btrfs_file_write_iter to fault page */
452 1150351 : if (unlikely(copied == 0))
453 : break;
454 :
455 1150352 : if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
456 42920 : offset += copied;
457 : } else {
458 1107432 : pg++;
459 : offset = 0;
460 : }
461 : }
462 118028 : return total_copied;
463 : }
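
/*
 * Worked example (editor's illustration, assuming 4K pages): for
 * pos = 5000 and write_bytes = 6000, offset = 5000 & 4095 = 904, so
 * the first iteration copies min(4096 - 904, 6000) = 3192 bytes into
 * page 0 and the second copies the remaining 2808 bytes into page 1.
 * A fault mid-copy returns a short count, and the caller retries one
 * page at a time.
 */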
464 :
465 : /*
466 : * unlocks pages after btrfs_file_write is done with them
467 : */
468 118029 : static void btrfs_drop_pages(struct page **pages, size_t num_pages)
469 : {
470 : size_t i;
471 1268397 : for (i = 0; i < num_pages; i++) {
472 : /* PageChecked is some magic around finding pages that
473 : * have been modified without going through btrfs_set_page_dirty;
474 : * clear it here. There should be no need to mark the pages
475 : * accessed, as prepare_pages should have marked them accessed
476 : * via find_or_create_page()
477 : */
478 1150369 : ClearPageChecked(pages[i]);
479 1150367 : unlock_page(pages[i]);
480 1150366 : page_cache_release(pages[i]);
481 : }
482 118028 : }
483 :
484 : /*
485 : * after copy_from_user, pages need to be dirtied and we need to make
486 : * sure holes are created between the current EOF and the start of
487 : * any next extents (if required).
488 : *
489 : * this also makes the decision about creating an inline extent vs
490 : * doing real data extents, marking pages dirty and delalloc as required.
491 : */
492 244170 : int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
493 : struct page **pages, size_t num_pages,
494 : loff_t pos, size_t write_bytes,
495 : struct extent_state **cached)
496 : {
497 : int err = 0;
498 : int i;
499 : u64 num_bytes;
500 : u64 start_pos;
501 : u64 end_of_last_block;
502 122085 : u64 end_pos = pos + write_bytes;
503 : loff_t isize = i_size_read(inode);
504 :
505 122085 : start_pos = pos & ~((u64)root->sectorsize - 1);
506 122085 : num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
507 :
508 122085 : end_of_last_block = start_pos + num_bytes - 1;
509 122085 : err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
510 : cached);
511 122085 : if (err)
512 : return err;
513 :
514 1339903 : for (i = 0; i < num_pages; i++) {
515 1339903 : struct page *p = pages[i];
516 : SetPageUptodate(p);
517 : ClearPageChecked(p);
518 1339905 : set_page_dirty(p);
519 : }
520 :
521 : /*
522 : * we've only changed i_size in RAM, and we haven't updated
523 : * the disk i_size. There is no need to log the inode
524 : * at this time.
525 : */
526 122085 : if (end_pos > isize)
527 114746 : i_size_write(inode, end_pos);
528 : return 0;
529 : }
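
/*
 * Alignment example (editor's illustration, assuming a 4K sectorsize):
 * for pos = 5000 and write_bytes = 2000, start_pos rounds down to
 * 4096, num_bytes = ALIGN(2000 + 5000 - 4096, 4096) = 4096 and
 * end_of_last_block = 8191, so exactly one sector is marked delalloc
 * even though the write touches neither of its edges.
 */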
530 :
531 : /*
532 : * this drops all the extents in the cache that intersect the range
533 : * [start, end]. Existing extents are split as required.
534 : */
535 116261 : void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
536 : int skip_pinned)
537 : {
538 : struct extent_map *em;
539 : struct extent_map *split = NULL;
540 : struct extent_map *split2 = NULL;
541 116261 : struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
542 116261 : u64 len = end - start + 1;
543 : u64 gen;
544 : int ret;
545 : int testend = 1;
546 : unsigned long flags;
547 : int compressed = 0;
548 : bool modified;
549 :
550 116261 : WARN_ON(end < start);
551 116266 : if (end == (u64)-1) {
552 : len = (u64)-1;
553 : testend = 0;
554 : }
555 : while (1) {
556 : int no_splits = 0;
557 :
558 : modified = false;
559 178927 : if (!split)
560 120952 : split = alloc_extent_map();
561 178913 : if (!split2)
562 145724 : split2 = alloc_extent_map();
563 178911 : if (!split || !split2)
564 : no_splits = 1;
565 :
566 178911 : write_lock(&em_tree->lock);
567 178935 : em = lookup_extent_mapping(em_tree, start, len);
568 178930 : if (!em) {
569 : write_unlock(&em_tree->lock);
570 : break;
571 : }
572 62660 : flags = em->flags;
573 62660 : gen = em->generation;
574 63734 : if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
575 0 : if (testend && em->start + em->len >= start + len) {
576 0 : free_extent_map(em);
577 : write_unlock(&em_tree->lock);
578 : break;
579 : }
580 0 : start = em->start + em->len;
581 0 : if (testend)
582 0 : len = start + len - (em->start + em->len);
583 0 : free_extent_map(em);
584 : write_unlock(&em_tree->lock);
585 0 : continue;
586 : }
587 : compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
588 : clear_bit(EXTENT_FLAG_PINNED, &em->flags);
589 : clear_bit(EXTENT_FLAG_LOGGING, &flags);
590 125328 : modified = !list_empty(&em->list);
591 62664 : if (no_splits)
592 : goto next;
593 :
594 62664 : if (em->start < start) {
595 29457 : split->start = em->start;
596 29457 : split->len = start - em->start;
597 :
598 29457 : if (em->block_start < EXTENT_MAP_LAST_BYTE) {
599 2294 : split->orig_start = em->orig_start;
600 2294 : split->block_start = em->block_start;
601 :
602 2294 : if (compressed)
603 54 : split->block_len = em->block_len;
604 : else
605 2240 : split->block_len = split->len;
606 2294 : split->orig_block_len = max(split->block_len,
607 : em->orig_block_len);
608 2294 : split->ram_bytes = em->ram_bytes;
609 : } else {
610 27163 : split->orig_start = split->start;
611 27163 : split->block_len = 0;
612 27163 : split->block_start = em->block_start;
613 27163 : split->orig_block_len = 0;
614 27163 : split->ram_bytes = split->len;
615 : }
616 :
617 29457 : split->generation = gen;
618 29457 : split->bdev = em->bdev;
619 29457 : split->flags = flags;
620 29457 : split->compress_type = em->compress_type;
621 29457 : replace_extent_mapping(em_tree, em, split, modified);
622 29457 : free_extent_map(split);
623 : split = split2;
624 : split2 = NULL;
625 : }
626 62663 : if (testend && em->start + em->len > start + len) {
627 4689 : u64 diff = start + len - em->start;
628 :
629 4689 : split->start = start + len;
630 4689 : split->len = em->start + em->len - (start + len);
631 4689 : split->bdev = em->bdev;
632 4689 : split->flags = flags;
633 4689 : split->compress_type = em->compress_type;
634 4689 : split->generation = gen;
635 :
636 4689 : if (em->block_start < EXTENT_MAP_LAST_BYTE) {
637 2777 : split->orig_block_len = max(em->block_len,
638 : em->orig_block_len);
639 :
640 2777 : split->ram_bytes = em->ram_bytes;
641 2777 : if (compressed) {
642 11 : split->block_len = em->block_len;
643 11 : split->block_start = em->block_start;
644 11 : split->orig_start = em->orig_start;
645 : } else {
646 2766 : split->block_len = split->len;
647 5532 : split->block_start = em->block_start
648 2766 : + diff;
649 2766 : split->orig_start = em->orig_start;
650 : }
651 : } else {
652 1912 : split->ram_bytes = split->len;
653 1912 : split->orig_start = split->start;
654 1912 : split->block_len = 0;
655 1912 : split->block_start = em->block_start;
656 1912 : split->orig_block_len = 0;
657 : }
658 :
659 4689 : if (extent_map_in_tree(em)) {
660 3246 : replace_extent_mapping(em_tree, em, split,
661 : modified);
662 : } else {
663 1443 : ret = add_extent_mapping(em_tree, split,
664 : modified);
665 : ASSERT(ret == 0); /* Logic error */
666 : }
667 4689 : free_extent_map(split);
668 : split = NULL;
669 : }
670 : next:
671 62663 : if (extent_map_in_tree(em))
672 29960 : remove_extent_mapping(em_tree, em);
673 : write_unlock(&em_tree->lock);
674 :
675 : /* once for us */
676 62664 : free_extent_map(em);
677 : /* once for the tree*/
678 62663 : free_extent_map(em);
679 : }
680 116269 : if (split)
681 116270 : free_extent_map(split);
682 116270 : if (split2)
683 116270 : free_extent_map(split2);
684 116271 : }
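
/*
 * Split behaviour sketched (editor's illustration): dropping a range
 * that sits inside a cached extent map leaves two pieces behind,
 *
 *	|---------------- original em ----------------|
 *	            |--- dropped range ---|
 *	|- split -|                         |- split2 -|
 *
 * with block_start, block_len and orig_start adjusted for the
 * compressed and hole cases as handled above.
 */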
685 :
686 : /*
687 : * this is very complex, but the basic idea is to drop all extents
688 : * in the range start - end. If drop_end is not NULL, it is set to the
689 : * end of the last extent processed (capped at end), or to end if none
690 : * was found.
690 : *
691 : * If an extent intersects the range but is not entirely inside the range
692 : * it is either truncated or split. Anything entirely inside the range
693 : * is deleted from the tree.
694 : */
695 64833 : int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
696 : struct btrfs_root *root, struct inode *inode,
697 : struct btrfs_path *path, u64 start, u64 end,
698 : u64 *drop_end, int drop_cache,
699 : int replace_extent,
700 : u32 extent_item_size,
701 : int *key_inserted)
702 : {
703 122552 : struct extent_buffer *leaf;
704 : struct btrfs_file_extent_item *fi;
705 : struct btrfs_key key;
706 : struct btrfs_key new_key;
707 : u64 ino = btrfs_ino(inode);
708 : u64 search_start = start;
709 : u64 disk_bytenr = 0;
710 : u64 num_bytes = 0;
711 : u64 extent_offset = 0;
712 : u64 extent_end = 0;
713 : int del_nr = 0;
714 : int del_slot = 0;
715 : int extent_type;
716 : int recow;
717 : int ret;
718 : int modify_tree = -1;
719 : int update_refs;
720 : int found = 0;
721 : int leafs_visited = 0;
722 :
723 64833 : if (drop_cache)
724 9063 : btrfs_drop_extent_cache(inode, start, end - 1, 0);
725 :
726 64837 : if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
727 : modify_tree = 0;
728 :
729 71215 : update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
730 6378 : root == root->fs_info->tree_root);
731 : while (1) {
732 : recow = 0;
733 65386 : ret = btrfs_lookup_file_extent(trans, root, path, ino,
734 : search_start, modify_tree);
735 65394 : if (ret < 0)
736 : break;
737 65395 : if (ret > 0 && path->slots[0] > 0 && search_start == start) {
738 55498 : leaf = path->nodes[0];
739 55498 : btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
740 110997 : if (key.objectid == ino &&
741 55499 : key.type == BTRFS_EXTENT_DATA_KEY)
742 38625 : path->slots[0]--;
743 : }
744 : ret = 0;
745 65395 : leafs_visited++;
746 : next_slot:
747 76119 : leaf = path->nodes[0];
748 152238 : if (path->slots[0] >= btrfs_header_nritems(leaf)) {
749 5778 : BUG_ON(del_nr > 0);
750 5778 : ret = btrfs_next_leaf(root, path);
751 5781 : if (ret < 0)
752 : break;
753 5781 : if (ret > 0) {
754 : ret = 0;
755 : break;
756 : }
757 271 : leafs_visited++;
758 271 : leaf = path->nodes[0];
759 : recow = 1;
760 : }
761 :
762 70612 : btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
763 120858 : if (key.objectid > ino ||
764 100496 : key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
765 : break;
766 :
767 100362 : fi = btrfs_item_ptr(leaf, path->slots[0],
768 : struct btrfs_file_extent_item);
769 50179 : extent_type = btrfs_file_extent_type(leaf, fi);
770 :
771 50179 : if (extent_type == BTRFS_FILE_EXTENT_REG ||
772 : extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
773 : disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
774 : num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
775 : extent_offset = btrfs_file_extent_offset(leaf, fi);
776 100267 : extent_end = key.offset +
777 : btrfs_file_extent_num_bytes(leaf, fi);
778 44 : } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
779 88 : extent_end = key.offset +
780 44 : btrfs_file_extent_inline_len(leaf,
781 : path->slots[0], fi);
782 : } else {
783 0 : WARN_ON(1);
784 : extent_end = search_start;
785 : }
786 :
787 : /*
788 : * Don't skip extent items representing 0 byte lengths. They
789 : * used to be created (a bug) when punching holes hit an
790 : * -ENOSPC condition. So if we find one here, just ensure we
791 : * delete it, otherwise we would insert a new file extent item
792 : * with the same key (offset) as that 0 bytes length file
793 : * extent item in the call to setup_items_for_insert() later
794 : * in this function.
795 : */
796 50178 : if (extent_end == key.offset && extent_end >= search_start)
797 : goto delete_extent_item;
798 :
799 50178 : if (extent_end <= search_start) {
800 8837 : path->slots[0]++;
801 8837 : goto next_slot;
802 : }
803 :
804 : found = 1;
805 41341 : search_start = max(key.offset, start);
806 41341 : if (recow || !modify_tree) {
807 : modify_tree = -1;
808 482 : btrfs_release_path(path);
809 482 : continue;
810 : }
811 :
812 : /*
813 : * | - range to drop - |
814 : * | -------- extent -------- |
815 : */
816 40859 : if (start > key.offset && end < extent_end) {
817 1239 : BUG_ON(del_nr > 0);
818 1239 : if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
819 : ret = -EOPNOTSUPP;
820 : break;
821 : }
822 :
823 1239 : memcpy(&new_key, &key, sizeof(new_key));
824 1239 : new_key.offset = start;
825 1239 : ret = btrfs_duplicate_item(trans, root, path,
826 : &new_key);
827 1239 : if (ret == -EAGAIN) {
828 2 : btrfs_release_path(path);
829 2 : continue;
830 : }
831 1237 : if (ret < 0)
832 : break;
833 :
834 1237 : leaf = path->nodes[0];
835 2474 : fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
836 : struct btrfs_file_extent_item);
837 1237 : btrfs_set_file_extent_num_bytes(leaf, fi,
838 1237 : start - key.offset);
839 :
840 2474 : fi = btrfs_item_ptr(leaf, path->slots[0],
841 : struct btrfs_file_extent_item);
842 :
843 1237 : extent_offset += start - key.offset;
844 : btrfs_set_file_extent_offset(leaf, fi, extent_offset);
845 1237 : btrfs_set_file_extent_num_bytes(leaf, fi,
846 : extent_end - start);
847 1237 : btrfs_mark_buffer_dirty(leaf);
848 :
849 1237 : if (update_refs && disk_bytenr > 0) {
850 675 : ret = btrfs_inc_extent_ref(trans, root,
851 : disk_bytenr, num_bytes, 0,
852 : root->root_key.objectid,
853 : new_key.objectid,
854 : start - extent_offset, 1);
855 675 : BUG_ON(ret); /* -ENOMEM */
856 : }
857 1237 : key.offset = start;
858 : }
859 : /*
860 : * | ---- range to drop ----- |
861 : * | -------- extent -------- |
862 : */
863 40857 : if (start <= key.offset && end < extent_end) {
864 3244 : if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
865 : ret = -EOPNOTSUPP;
866 : break;
867 : }
868 :
869 3244 : memcpy(&new_key, &key, sizeof(new_key));
870 3244 : new_key.offset = end;
871 3244 : btrfs_set_item_key_safe(root, path, &new_key);
872 :
873 3244 : extent_offset += end - key.offset;
874 : btrfs_set_file_extent_offset(leaf, fi, extent_offset);
875 3244 : btrfs_set_file_extent_num_bytes(leaf, fi,
876 : extent_end - end);
877 3244 : btrfs_mark_buffer_dirty(leaf);
878 3244 : if (update_refs && disk_bytenr > 0)
879 1377 : inode_sub_bytes(inode, end - key.offset);
880 : break;
881 : }
882 :
883 : search_start = extent_end;
884 : /*
885 : * | ---- range to drop ----- |
886 : * | -------- extent -------- |
887 : */
888 37613 : if (start > key.offset && end >= extent_end) {
889 28524 : BUG_ON(del_nr > 0);
890 28524 : if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
891 : ret = -EOPNOTSUPP;
892 : break;
893 : }
894 :
895 28524 : btrfs_set_file_extent_num_bytes(leaf, fi,
896 : start - key.offset);
897 28524 : btrfs_mark_buffer_dirty(leaf);
898 28524 : if (update_refs && disk_bytenr > 0)
899 1106 : inode_sub_bytes(inode, extent_end - start);
900 28524 : if (end == extent_end)
901 : break;
902 :
903 232 : path->slots[0]++;
904 232 : goto next_slot;
905 : }
906 :
907 : /*
908 : * | ---- range to drop ----- |
909 : * | ------ extent ------ |
910 : */
911 9089 : if (start <= key.offset && end >= extent_end) {
912 : delete_extent_item:
913 9089 : if (del_nr == 0) {
914 7548 : del_slot = path->slots[0];
915 : del_nr = 1;
916 : } else {
917 1541 : BUG_ON(del_slot + del_nr != path->slots[0]);
918 1541 : del_nr++;
919 : }
920 :
921 18178 : if (update_refs &&
922 9089 : extent_type == BTRFS_FILE_EXTENT_INLINE) {
923 44 : inode_sub_bytes(inode,
924 44 : extent_end - key.offset);
925 44 : extent_end = ALIGN(extent_end,
926 : root->sectorsize);
927 9045 : } else if (update_refs && disk_bytenr > 0) {
928 7437 : ret = btrfs_free_extent(trans, root,
929 : disk_bytenr, num_bytes, 0,
930 : root->root_key.objectid,
931 7437 : key.objectid, key.offset -
932 : extent_offset, 0);
933 7438 : BUG_ON(ret); /* -ENOMEM */
934 7438 : inode_sub_bytes(inode,
935 7438 : extent_end - key.offset);
936 : }
937 :
938 9090 : if (end == extent_end)
939 : break;
940 :
941 3440 : if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
942 1655 : path->slots[0]++;
943 1655 : goto next_slot;
944 : }
945 :
946 65 : ret = btrfs_del_items(trans, root, path, del_slot,
947 : del_nr);
948 65 : if (ret) {
949 0 : btrfs_abort_transaction(trans, root, ret);
950 0 : break;
951 : }
952 :
953 : del_nr = 0;
954 : del_slot = 0;
955 :
956 65 : btrfs_release_path(path);
957 65 : continue;
958 : }
959 :
960 0 : BUG_ON(1);
961 : }
962 :
963 64845 : if (!ret && del_nr > 0) {
964 : /*
965 : * Set path->slots[0] to first slot, so that after the delete
966 : * if items are moved off from our leaf to its immediate left or
967 : * right neighbor leaves, we end up with a correct and adjusted
968 : * path->slots[0] for our insertion (if replace_extent != 0).
969 : */
970 7484 : path->slots[0] = del_slot;
971 7484 : ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
972 7484 : if (ret)
973 0 : btrfs_abort_transaction(trans, root, ret);
974 : }
975 :
976 64845 : leaf = path->nodes[0];
977 : /*
978 : * If btrfs_del_items() was called, it might have deleted a leaf, in
979 : * which case it unlocked our path, so check path->locks[0] matches a
980 : * write lock.
981 : */
982 124216 : if (!ret && replace_extent && leafs_visited == 1 &&
983 59371 : (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
984 55149 : path->locks[0] == BTRFS_WRITE_LOCK) &&
985 55149 : btrfs_leaf_free_space(root, leaf) >=
986 55149 : sizeof(struct btrfs_item) + extent_item_size) {
987 :
988 51843 : key.objectid = ino;
989 51843 : key.type = BTRFS_EXTENT_DATA_KEY;
990 51843 : key.offset = start;
991 96556 : if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
992 : struct btrfs_key slot_key;
993 :
994 44712 : btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
995 44712 : if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
996 27786 : path->slots[0]++;
997 : }
998 51843 : setup_items_for_insert(root, path, &key,
999 : &extent_item_size,
1000 : extent_item_size,
1001 : sizeof(struct btrfs_item) +
1002 : extent_item_size, 1);
1003 51841 : *key_inserted = 1;
1004 : }
1005 :
1006 64843 : if (!replace_extent || !(*key_inserted))
1007 13003 : btrfs_release_path(path);
1008 64840 : if (drop_end)
1009 112 : *drop_end = found ? min(end, extent_end) : end;
1010 64840 : return ret;
1011 : }
1012 :
1013 5100 : int btrfs_drop_extents(struct btrfs_trans_handle *trans,
1014 : struct btrfs_root *root, struct inode *inode, u64 start,
1015 : u64 end, int drop_cache)
1016 : {
1017 : struct btrfs_path *path;
1018 : int ret;
1019 :
1020 5100 : path = btrfs_alloc_path();
1021 5100 : if (!path)
1022 : return -ENOMEM;
1023 5100 : ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
1024 : drop_cache, 0, 0, NULL);
1025 5099 : btrfs_free_path(path);
1026 5099 : return ret;
1027 : }
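
/*
 * Usage note (editor's illustration; new_size is a stand-in): callers
 * that simply want the range gone use the wrapper above, e.g. roughly
 *
 *	ret = btrfs_drop_extents(trans, root, inode, 0, new_size, 1);
 *
 * while __btrfs_drop_extents() additionally lets callers such as
 * inline extent insertion reserve leaf space for a replacement file
 * extent item (replace_extent / extent_item_size / key_inserted).
 */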
1028 :
1029 23125 : static int extent_mergeable(struct extent_buffer *leaf, int slot,
1030 : u64 objectid, u64 bytenr, u64 orig_offset,
1031 : u64 *start, u64 *end)
1032 : {
1033 : struct btrfs_file_extent_item *fi;
1034 : struct btrfs_key key;
1035 : u64 extent_end;
1036 :
1037 23125 : if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1038 : return 0;
1039 :
1040 11524 : btrfs_item_key_to_cpu(leaf, &key, slot);
1041 11524 : if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
1042 : return 0;
1043 :
1044 3522 : fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1045 5199 : if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
1046 35 : btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1047 70 : btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
1048 35 : btrfs_file_extent_compression(leaf, fi) ||
1049 35 : btrfs_file_extent_encryption(leaf, fi) ||
1050 : btrfs_file_extent_other_encoding(leaf, fi))
1051 : return 0;
1052 :
1053 70 : extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1054 35 : if ((*start && *start != key.offset) || (*end && *end != extent_end))
1055 : return 0;
1056 :
1057 35 : *start = key.offset;
1058 35 : *end = extent_end;
1059 35 : return 1;
1060 : }
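
/*
 * Example (editor's illustration): a neighbor is mergeable only if it
 * is a plain BTRFS_FILE_EXTENT_REG item pointing at the same disk
 * extent (same bytenr, consistent orig_offset) with no compression,
 * encryption or other encoding; this is what lets a prealloc extent
 * written in pieces collapse back into a single file extent item.
 */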
1061 :
1062 : /*
1063 : * Mark extent in the range start - end as written.
1064 : *
1065 : * This changes extent type from 'pre-allocated' to 'regular'. If only
1066 : * part of extent is marked as written, the extent will be split into
1067 : * two or three.
1068 : */
1069 5625 : int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1070 : struct inode *inode, u64 start, u64 end)
1071 : {
1072 5625 : struct btrfs_root *root = BTRFS_I(inode)->root;
1073 : struct extent_buffer *leaf;
1074 : struct btrfs_path *path;
1075 : struct btrfs_file_extent_item *fi;
1076 : struct btrfs_key key;
1077 : struct btrfs_key new_key;
1078 : u64 bytenr;
1079 : u64 num_bytes;
1080 : u64 extent_end;
1081 : u64 orig_offset;
1082 : u64 other_start;
1083 : u64 other_end;
1084 : u64 split;
1085 : int del_nr = 0;
1086 : int del_slot = 0;
1087 : int recow;
1088 : int ret;
1089 : u64 ino = btrfs_ino(inode);
1090 :
1091 5625 : path = btrfs_alloc_path();
1092 5624 : if (!path)
1093 : return -ENOMEM;
1094 : again:
1095 : recow = 0;
1096 : split = start;
1097 5625 : key.objectid = ino;
1098 5625 : key.type = BTRFS_EXTENT_DATA_KEY;
1099 5625 : key.offset = split;
1100 :
1101 5625 : ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1102 5625 : if (ret < 0)
1103 : goto out;
1104 5625 : if (ret > 0 && path->slots[0] > 0)
1105 441 : path->slots[0]--;
1106 :
1107 5625 : leaf = path->nodes[0];
1108 5625 : btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1109 5625 : BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
1110 11250 : fi = btrfs_item_ptr(leaf, path->slots[0],
1111 : struct btrfs_file_extent_item);
1112 5625 : BUG_ON(btrfs_file_extent_type(leaf, fi) !=
1113 : BTRFS_FILE_EXTENT_PREALLOC);
1114 11250 : extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1115 5625 : BUG_ON(key.offset > start || extent_end < end);
1116 :
1117 : bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1118 : num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1119 11250 : orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1120 5625 : memcpy(&new_key, &key, sizeof(new_key));
1121 :
1122 5625 : if (start == key.offset && end < extent_end) {
1123 179 : other_start = 0;
1124 179 : other_end = start;
1125 179 : if (extent_mergeable(leaf, path->slots[0] - 1,
1126 : ino, bytenr, orig_offset,
1127 : &other_start, &other_end)) {
1128 29 : new_key.offset = end;
1129 29 : btrfs_set_item_key_safe(root, path, &new_key);
1130 58 : fi = btrfs_item_ptr(leaf, path->slots[0],
1131 : struct btrfs_file_extent_item);
1132 29 : btrfs_set_file_extent_generation(leaf, fi,
1133 : trans->transid);
1134 29 : btrfs_set_file_extent_num_bytes(leaf, fi,
1135 : extent_end - end);
1136 29 : btrfs_set_file_extent_offset(leaf, fi,
1137 : end - orig_offset);
1138 58 : fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1139 : struct btrfs_file_extent_item);
1140 29 : btrfs_set_file_extent_generation(leaf, fi,
1141 : trans->transid);
1142 29 : btrfs_set_file_extent_num_bytes(leaf, fi,
1143 : end - other_start);
1144 29 : btrfs_mark_buffer_dirty(leaf);
1145 29 : goto out;
1146 : }
1147 : }
1148 :
1149 5596 : if (start > key.offset && end == extent_end) {
1150 204 : other_start = end;
1151 204 : other_end = 0;
1152 204 : if (extent_mergeable(leaf, path->slots[0] + 1,
1153 : ino, bytenr, orig_offset,
1154 : &other_start, &other_end)) {
1155 8 : fi = btrfs_item_ptr(leaf, path->slots[0],
1156 : struct btrfs_file_extent_item);
1157 4 : btrfs_set_file_extent_num_bytes(leaf, fi,
1158 4 : start - key.offset);
1159 4 : btrfs_set_file_extent_generation(leaf, fi,
1160 : trans->transid);
1161 4 : path->slots[0]++;
1162 4 : new_key.offset = start;
1163 4 : btrfs_set_item_key_safe(root, path, &new_key);
1164 :
1165 8 : fi = btrfs_item_ptr(leaf, path->slots[0],
1166 : struct btrfs_file_extent_item);
1167 4 : btrfs_set_file_extent_generation(leaf, fi,
1168 : trans->transid);
1169 4 : btrfs_set_file_extent_num_bytes(leaf, fi,
1170 : other_end - start);
1171 4 : btrfs_set_file_extent_offset(leaf, fi,
1172 : start - orig_offset);
1173 4 : btrfs_mark_buffer_dirty(leaf);
1174 4 : goto out;
1175 : }
1176 : }
1177 :
1178 6416 : while (start > key.offset || end < extent_end) {
1179 824 : if (key.offset == start)
1180 : split = end;
1181 :
1182 824 : new_key.offset = split;
1183 824 : ret = btrfs_duplicate_item(trans, root, path, &new_key);
1184 824 : if (ret == -EAGAIN) {
1185 0 : btrfs_release_path(path);
1186 0 : goto again;
1187 : }
1188 824 : if (ret < 0) {
1189 0 : btrfs_abort_transaction(trans, root, ret);
1190 0 : goto out;
1191 : }
1192 :
1193 824 : leaf = path->nodes[0];
1194 1648 : fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1195 : struct btrfs_file_extent_item);
1196 824 : btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1197 824 : btrfs_set_file_extent_num_bytes(leaf, fi,
1198 824 : split - key.offset);
1199 :
1200 1648 : fi = btrfs_item_ptr(leaf, path->slots[0],
1201 : struct btrfs_file_extent_item);
1202 :
1203 824 : btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1204 824 : btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1205 824 : btrfs_set_file_extent_num_bytes(leaf, fi,
1206 : extent_end - split);
1207 824 : btrfs_mark_buffer_dirty(leaf);
1208 :
1209 824 : ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
1210 : root->root_key.objectid,
1211 : ino, orig_offset, 1);
1212 824 : BUG_ON(ret); /* -ENOMEM */
1213 :
1214 824 : if (split == start) {
1215 437 : key.offset = start;
1216 : } else {
1217 387 : BUG_ON(start != key.offset);
1218 387 : path->slots[0]--;
1219 : extent_end = end;
1220 : }
1221 : recow = 1;
1222 : }
1223 :
1224 5592 : other_start = end;
1225 5592 : other_end = 0;
1226 5592 : if (extent_mergeable(leaf, path->slots[0] + 1,
1227 : ino, bytenr, orig_offset,
1228 : &other_start, &other_end)) {
1229 1 : if (recow) {
1230 0 : btrfs_release_path(path);
1231 0 : goto again;
1232 : }
1233 1 : extent_end = other_end;
1234 1 : del_slot = path->slots[0] + 1;
1235 1 : del_nr++;
1236 1 : ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1237 : 0, root->root_key.objectid,
1238 : ino, orig_offset, 0);
1239 1 : BUG_ON(ret); /* -ENOMEM */
1240 : }
1241 5592 : other_start = 0;
1242 5592 : other_end = start;
1243 5592 : if (extent_mergeable(leaf, path->slots[0] - 1,
1244 : ino, bytenr, orig_offset,
1245 : &other_start, &other_end)) {
1246 1 : if (recow) {
1247 0 : btrfs_release_path(path);
1248 0 : goto again;
1249 : }
1250 1 : key.offset = other_start;
1251 1 : del_slot = path->slots[0];
1252 1 : del_nr++;
1253 1 : ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1254 : 0, root->root_key.objectid,
1255 : ino, orig_offset, 0);
1256 1 : BUG_ON(ret); /* -ENOMEM */
1257 : }
1258 5592 : if (del_nr == 0) {
1259 11180 : fi = btrfs_item_ptr(leaf, path->slots[0],
1260 : struct btrfs_file_extent_item);
1261 : btrfs_set_file_extent_type(leaf, fi,
1262 : BTRFS_FILE_EXTENT_REG);
1263 5590 : btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1264 5590 : btrfs_mark_buffer_dirty(leaf);
1265 : } else {
1266 4 : fi = btrfs_item_ptr(leaf, del_slot - 1,
1267 : struct btrfs_file_extent_item);
1268 : btrfs_set_file_extent_type(leaf, fi,
1269 : BTRFS_FILE_EXTENT_REG);
1270 2 : btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1271 2 : btrfs_set_file_extent_num_bytes(leaf, fi,
1272 2 : extent_end - key.offset);
1273 2 : btrfs_mark_buffer_dirty(leaf);
1274 :
1275 2 : ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1276 2 : if (ret < 0) {
1277 0 : btrfs_abort_transaction(trans, root, ret);
1278 0 : goto out;
1279 : }
1280 : }
1281 : out:
1282 5625 : btrfs_free_path(path);
1283 5625 : return 0;
1284 : }
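
/*
 * Example (editor's illustration): writing [4K, 8K) into a prealloc
 * extent covering [0, 12K) splits it into three items - prealloc
 * [0, 4K), regular [4K, 8K) and prealloc [8K, 12K) - all still
 * sharing the original disk extent via the btrfs_inc_extent_ref()
 * calls above.
 */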
1285 :
1286 : /*
1287 : * on error we return an unlocked page and the error value;
1288 : * on success we return a locked page and 0
1289 : */
1290 236072 : static int prepare_uptodate_page(struct page *page, u64 pos,
1291 : bool force_uptodate)
1292 : {
1293 : int ret = 0;
1294 :
1295 314331 : if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
1296 : !PageUptodate(page)) {
1297 25436 : ret = btrfs_readpage(NULL, page);
1298 25436 : if (ret)
1299 : return ret;
1300 25436 : lock_page(page);
1301 25436 : if (!PageUptodate(page)) {
1302 0 : unlock_page(page);
1303 0 : return -EIO;
1304 : }
1305 : }
1306 : return 0;
1307 : }
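
/*
 * Note (editor's illustration): only a page that will be partially
 * overwritten needs to be read first; a write starting on a page
 * boundary skips btrfs_readpage() for that page unless an earlier
 * short copy forced force_uptodate.
 */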
1308 :
1309 : /*
1310 : * this just gets pages into the page cache and locks them down.
1311 : */
1312 118036 : static noinline int prepare_pages(struct inode *inode, struct page **pages,
1313 : size_t num_pages, loff_t pos,
1314 : size_t write_bytes, bool force_uptodate)
1315 : {
1316 : int i;
1317 118036 : unsigned long index = pos >> PAGE_CACHE_SHIFT;
1318 118036 : gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1319 : int err = 0;
1320 : int faili;
1321 :
1322 1150365 : for (i = 0; i < num_pages; i++) {
1323 2300721 : pages[i] = find_or_create_page(inode->i_mapping, index + i,
1324 : mask | __GFP_WRITE);
1325 1150360 : if (!pages[i]) {
1326 0 : faili = i - 1;
1327 : err = -ENOMEM;
1328 : goto fail;
1329 : }
1330 :
1331 1150360 : if (i == 0)
1332 118036 : err = prepare_uptodate_page(pages[i], pos,
1333 : force_uptodate);
1334 1150363 : if (i == num_pages - 1)
1335 118036 : err = prepare_uptodate_page(pages[i],
1336 : pos + write_bytes, false);
1337 1150363 : if (err) {
1338 0 : page_cache_release(pages[i]);
1339 0 : faili = i - 1;
1340 : goto fail;
1341 : }
1342 1150363 : wait_on_page_writeback(pages[i]);
1343 : }
1344 :
1345 : return 0;
1346 : fail:
1347 0 : while (faili >= 0) {
1348 0 : unlock_page(pages[faili]);
1349 0 : page_cache_release(pages[faili]);
1350 0 : faili--;
1351 : }
1352 : return err;
1353 :
1354 : }
1355 :
1356 : /*
1357 : * This function locks the extent and properly waits for data=ordered extents
1358 : * to finish before allowing the pages to be modified if needed.
1359 : *
1360 : * The return value:
1361 : * 1 - the extent is locked
1362 : * 0 - the extent is not locked, and everything is OK
1363 : * -EAGAIN - need to re-prepare the pages
1364 : * any other < 0 value - something went wrong
1365 : */
1366 : static noinline int
1367 118036 : lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1368 : size_t num_pages, loff_t pos,
1369 : u64 *lockstart, u64 *lockend,
1370 : struct extent_state **cached_state)
1371 : {
1372 : u64 start_pos;
1373 : u64 last_pos;
1374 : int i;
1375 : int ret = 0;
1376 :
1377 118036 : start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
1378 118036 : last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
1379 :
1380 118036 : if (start_pos < inode->i_size) {
1381 : struct btrfs_ordered_extent *ordered;
1382 35820 : lock_extent_bits(&BTRFS_I(inode)->io_tree,
1383 : start_pos, last_pos, 0, cached_state);
1384 35820 : ordered = btrfs_lookup_ordered_range(inode, start_pos,
1385 35820 : last_pos - start_pos + 1);
1386 35828 : if (ordered &&
1387 16 : ordered->file_offset + ordered->len > start_pos &&
1388 : ordered->file_offset <= last_pos) {
1389 8 : unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1390 : start_pos, last_pos,
1391 : cached_state, GFP_NOFS);
1392 16 : for (i = 0; i < num_pages; i++) {
1393 8 : unlock_page(pages[i]);
1394 8 : page_cache_release(pages[i]);
1395 : }
1396 8 : btrfs_start_ordered_extent(inode, ordered, 1);
1397 8 : btrfs_put_ordered_extent(ordered);
1398 8 : return -EAGAIN;
1399 : }
1400 35812 : if (ordered)
1401 0 : btrfs_put_ordered_extent(ordered);
1402 :
1403 35812 : clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
1404 : last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
1405 : EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1406 : 0, 0, cached_state, GFP_NOFS);
1407 35812 : *lockstart = start_pos;
1408 35812 : *lockend = last_pos;
1409 : ret = 1;
1410 : }
1411 :
1412 1268388 : for (i = 0; i < num_pages; i++) {
1413 1150356 : if (clear_page_dirty_for_io(pages[i]))
1414 31901 : account_page_redirty(pages[i]);
1415 1150362 : set_page_extent_mapped(pages[i]);
1416 2300720 : WARN_ON(!PageLocked(pages[i]));
1417 : }
1418 :
1419 : return ret;
1420 : }
1421 :
1422 0 : static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1423 : size_t *write_bytes)
1424 : {
1425 0 : struct btrfs_root *root = BTRFS_I(inode)->root;
1426 : struct btrfs_ordered_extent *ordered;
1427 : u64 lockstart, lockend;
1428 : u64 num_bytes;
1429 : int ret;
1430 :
1431 0 : ret = btrfs_start_nocow_write(root);
1432 0 : if (!ret)
1433 : return -ENOSPC;
1434 :
1435 0 : lockstart = round_down(pos, root->sectorsize);
1436 0 : lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
1437 :
1438 : while (1) {
1439 0 : lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1440 0 : ordered = btrfs_lookup_ordered_range(inode, lockstart,
1441 0 : lockend - lockstart + 1);
1442 0 : if (!ordered) {
1443 : break;
1444 : }
1445 0 : unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1446 0 : btrfs_start_ordered_extent(inode, ordered, 1);
1447 0 : btrfs_put_ordered_extent(ordered);
1448 0 : }
1449 :
1450 0 : num_bytes = lockend - lockstart + 1;
1451 0 : ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1452 0 : if (ret <= 0) {
1453 : ret = 0;
1454 0 : btrfs_end_nocow_write(root);
1455 : } else {
1456 0 : *write_bytes = min_t(size_t, *write_bytes ,
1457 : num_bytes - pos + lockstart);
1458 : }
1459 :
1460 0 : unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1461 :
1462 0 : return ret;
1463 : }
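
/*
 * Sketch of the nocow path above (editor's illustration): when data
 * space reservation fails with -ENOSPC for a NODATACOW or prealloc
 * inode, the write may still proceed in place if can_nocow_extent()
 * confirms the locked range maps to an extent we are allowed to
 * overwrite; *write_bytes is then trimmed to the covered portion.
 */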
1464 :
1465 114088 : static noinline ssize_t __btrfs_buffered_write(struct file *file,
1466 346204 : struct iov_iter *i,
1467 : loff_t pos)
1468 : {
1469 : struct inode *inode = file_inode(file);
1470 114088 : struct btrfs_root *root = BTRFS_I(inode)->root;
1471 : struct page **pages = NULL;
1472 114088 : struct extent_state *cached_state = NULL;
1473 : u64 release_bytes = 0;
1474 : u64 lockstart;
1475 : u64 lockend;
1476 : unsigned long first_index;
1477 : size_t num_written = 0;
1478 : int nrptrs;
1479 : int ret = 0;
1480 : bool only_release_metadata = false;
1481 : bool force_page_uptodate = false;
1482 : bool need_unlock;
1483 :
1484 114088 : nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1485 : PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1486 : (sizeof(struct page *)));
1487 228176 : nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1488 114088 : nrptrs = max(nrptrs, 8);
1489 114088 : pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1490 114088 : if (!pages)
1491 : return -ENOMEM;
1492 :
1493 : first_index = pos >> PAGE_CACHE_SHIFT;
1494 :
1495 232116 : while (iov_iter_count(i) > 0) {
1496 118029 : size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1497 118029 : size_t write_bytes = min(iov_iter_count(i),
1498 : nrptrs * (size_t)PAGE_CACHE_SIZE -
1499 : offset);
1500 236058 : size_t num_pages = (write_bytes + offset +
1501 118029 : PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1502 : size_t reserve_bytes;
1503 : size_t dirty_pages;
1504 : size_t copied;
1505 :
1506 118029 : WARN_ON(num_pages > nrptrs);
1507 :
1508 : /*
1509 : * Fault pages before locking them in prepare_pages
1510 : * to avoid a recursive lock
1511 : */
1512 118029 : if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1513 : ret = -EFAULT;
1514 2 : break;
1515 : }
1516 :
1517 118029 : reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1518 118029 : ret = btrfs_check_data_free_space(inode, reserve_bytes);
1519 118030 : if (ret == -ENOSPC &&
1520 0 : (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1521 : BTRFS_INODE_PREALLOC))) {
1522 0 : ret = check_can_nocow(inode, pos, &write_bytes);
1523 0 : if (ret > 0) {
1524 : only_release_metadata = true;
1525 : /*
1526 : * our prealloc extent may be smaller than
1527 : * write_bytes, so scale down.
1528 : */
1529 0 : num_pages = (write_bytes + offset +
1530 0 : PAGE_CACHE_SIZE - 1) >>
1531 : PAGE_CACHE_SHIFT;
1532 0 : reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1533 : ret = 0;
1534 : } else {
1535 : ret = -ENOSPC;
1536 : }
1537 : }
1538 :
1539 118030 : if (ret)
1540 : break;
1541 :
1542 118030 : ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
1543 118030 : if (ret) {
1544 2 : if (!only_release_metadata)
1545 2 : btrfs_free_reserved_data_space(inode,
1546 : reserve_bytes);
1547 : else
1548 0 : btrfs_end_nocow_write(root);
1549 : break;
1550 : }
1551 :
1552 : release_bytes = reserve_bytes;
1553 : need_unlock = false;
1554 : again:
1555 : /*
1556 : * This is going to set up the pages array with the number of
1557 : * pages we want, so we don't really need to worry about the
1558 : * contents of pages from loop to loop
1559 : */
1560 118036 : ret = prepare_pages(inode, pages, num_pages,
1561 : pos, write_bytes,
1562 : force_page_uptodate);
1563 118036 : if (ret)
1564 : break;
1565 :
1566 118036 : ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
1567 : pos, &lockstart, &lockend,
1568 : &cached_state);
1569 118036 : if (ret < 0) {
1570 8 : if (ret == -EAGAIN)
1571 : goto again;
1572 : break;
1573 118028 : } else if (ret > 0) {
1574 : need_unlock = true;
1575 : ret = 0;
1576 : }
1577 :
1578 118028 : copied = btrfs_copy_from_user(pos, num_pages,
1579 : write_bytes, pages, i);
1580 :
1581 : /*
1582 : * if we have trouble faulting in the pages, fall
1583 : * back to one page at a time
1584 : */
1585 118028 : if (copied < write_bytes)
1586 : nrptrs = 1;
1587 :
1588 118028 : if (copied == 0) {
1589 : force_page_uptodate = true;
1590 : dirty_pages = 0;
1591 : } else {
1592 : force_page_uptodate = false;
1593 236056 : dirty_pages = (copied + offset +
1594 118028 : PAGE_CACHE_SIZE - 1) >>
1595 : PAGE_CACHE_SHIFT;
1596 : }
1597 :
1598 : /*
1599 : * If we had a short copy we need to release the excess delalloc
1600 : * bytes we reserved. We need to increment outstanding_extents
1601 : * because btrfs_delalloc_release_space will decrement it, but
1602 : * we still have an outstanding extent for the chunk we actually
1603 : * managed to copy.
1604 : */
1605 118028 : if (num_pages > dirty_pages) {
1606 0 : release_bytes = (num_pages - dirty_pages) <<
1607 : PAGE_CACHE_SHIFT;
1608 0 : if (copied > 0) {
1609 : spin_lock(&BTRFS_I(inode)->lock);
1610 0 : BTRFS_I(inode)->outstanding_extents++;
1611 : spin_unlock(&BTRFS_I(inode)->lock);
1612 : }
1613 0 : if (only_release_metadata)
1614 0 : btrfs_delalloc_release_metadata(inode,
1615 : release_bytes);
1616 : else
1617 0 : btrfs_delalloc_release_space(inode,
1618 : release_bytes);
1619 : }
1620 :
1621 118028 : release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
1622 :
1623 118028 : if (copied > 0)
1624 118028 : ret = btrfs_dirty_pages(root, inode, pages,
1625 : dirty_pages, pos, copied,
1626 : NULL);
1627 118028 : if (need_unlock)
1628 35812 : unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1629 : lockstart, lockend, &cached_state,
1630 : GFP_NOFS);
1631 118028 : if (ret) {
1632 0 : btrfs_drop_pages(pages, num_pages);
1633 0 : break;
1634 : }
1635 :
1636 : release_bytes = 0;
1637 118028 : if (only_release_metadata)
1638 0 : btrfs_end_nocow_write(root);
1639 :
1640 118028 : if (only_release_metadata && copied > 0) {
1641 0 : u64 lockstart = round_down(pos, root->sectorsize);
1642 0 : u64 lockend = lockstart +
1643 : (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1644 :
1645 0 : set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1646 : lockend, EXTENT_NORESERVE, NULL,
1647 : NULL, GFP_NOFS);
1648 : only_release_metadata = false;
1649 : }
1650 :
1651 118028 : btrfs_drop_pages(pages, num_pages);
1652 :
1653 118028 : cond_resched();
1654 :
1655 118028 : balance_dirty_pages_ratelimited(inode->i_mapping);
1656 118028 : if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1657 45643 : btrfs_btree_balance_dirty(root);
1658 :
1659 118028 : pos += copied;
1660 118028 : num_written += copied;
1661 : }
1662 :
1663 114089 : kfree(pages);
1664 :
1665 114088 : if (release_bytes) {
1666 0 : if (only_release_metadata) {
1667 0 : btrfs_end_nocow_write(root);
1668 0 : btrfs_delalloc_release_metadata(inode, release_bytes);
1669 : } else {
1670 0 : btrfs_delalloc_release_space(inode, release_bytes);
1671 : }
1672 : }
1673 :
1674 114088 : return num_written ? num_written : ret;
1675 : }
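
/*
 * Worked example for the nrptrs sizing above (editor's illustration,
 * assuming 4K pages and 8-byte pointers): a 1 MiB write needs 256
 * page pointers, PAGE_CACHE_SIZE / sizeof(struct page *) caps the
 * array at 512 entries, the dirty-throttling headroom may shrink it
 * further, and the floor of 8 keeps a tiny headroom from degrading
 * everything to page-at-a-time copies.
 */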
1676 :
1677 25270 : static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1678 25259 : struct iov_iter *from,
1679 : loff_t pos)
1680 : {
1681 25270 : struct file *file = iocb->ki_filp;
1682 : ssize_t written;
1683 : ssize_t written_buffered;
1684 : loff_t endbyte;
1685 : int err;
1686 :
1687 25270 : written = generic_file_direct_write(iocb, from, pos);
1688 :
1689 50528 : if (written < 0 || !iov_iter_count(from))
1690 : return written;
1691 :
1692 0 : pos += written;
1693 0 : written_buffered = __btrfs_buffered_write(file, from, pos);
1694 0 : if (written_buffered < 0) {
1695 0 : err = written_buffered;
1696 0 : goto out;
1697 : }
1698 0 : endbyte = pos + written_buffered - 1;
1699 0 : err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
1700 0 : if (err)
1701 : goto out;
1702 0 : written += written_buffered;
1703 0 : iocb->ki_pos = pos + written_buffered;
1704 0 : invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1705 0 : endbyte >> PAGE_CACHE_SHIFT);
1706 : out:
1707 0 : return written ? written : err;
1708 : }
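
/*
 * Fallback illustrated (editor's sketch): if the O_DIRECT write
 * completes only partially, the remaining bytes of the iov_iter are
 * written through __btrfs_buffered_write(), flushed with
 * filemap_write_and_wait_range(), and the affected page cache range
 * is invalidated so a later O_DIRECT read sees the data on disk.
 */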
1709 :
1710 139356 : static void update_time_for_write(struct inode *inode)
1711 : {
1712 : struct timespec now;
1713 :
1714 139356 : if (IS_NOCMTIME(inode))
1715 139358 : return;
1716 :
1717 139356 : now = current_fs_time(inode->i_sb);
1718 139355 : if (!timespec_equal(&inode->i_mtime, &now))
1719 58966 : inode->i_mtime = now;
1720 :
1721 139355 : if (!timespec_equal(&inode->i_ctime, &now))
1722 58890 : inode->i_ctime = now;
1723 :
1724 139355 : if (IS_I_VERSION(inode))
1725 : inode_inc_iversion(inode);
1726 : }
1727 :
1728 139379 : static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1729 139379 : struct iov_iter *from)
1730 : {
1731 139379 : struct file *file = iocb->ki_filp;
1732 139356 : struct inode *inode = file_inode(file);
1733 139379 : struct btrfs_root *root = BTRFS_I(inode)->root;
1734 : u64 start_pos;
1735 : u64 end_pos;
1736 : ssize_t num_written = 0;
1737 : ssize_t err = 0;
1738 139379 : size_t count = iov_iter_count(from);
1739 139379 : bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1740 139379 : loff_t pos = iocb->ki_pos;
1741 :
1742 139379 : mutex_lock(&inode->i_mutex);
1743 :
1744 139383 : current->backing_dev_info = inode->i_mapping->backing_dev_info;
1745 139383 : err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1746 139382 : if (err) {
1747 0 : mutex_unlock(&inode->i_mutex);
1748 0 : goto out;
1749 : }
1750 :
1751 139382 : if (count == 0) {
1752 26 : mutex_unlock(&inode->i_mutex);
1753 26 : goto out;
1754 : }
1755 :
1756 : iov_iter_truncate(from, count);
1757 :
1758 139356 : err = file_remove_suid(file);
1759 139357 : if (err) {
1760 0 : mutex_unlock(&inode->i_mutex);
1761 0 : goto out;
1762 : }
1763 :
1764 : /*
1765 : * If BTRFS flips readonly due to some impossible error
1766 : * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1767 : * although we have opened a file as writable, we have
1768 : * to stop this write operation to ensure FS consistency.
1769 : */
1770 278714 : if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
1771 0 : mutex_unlock(&inode->i_mutex);
1772 : err = -EROFS;
1773 0 : goto out;
1774 : }
1775 :
1776 : /*
1777 : * We reserve space for updating the inode when we reserve space for the
1778 : * extent we are going to write, so we will hit ENOSPC there. We don't
1779 : * need to start yet another transaction to update the inode as we will
1780 : * update the inode when we finish writing whatever data we write.
1781 : */
1782 139357 : update_time_for_write(inode);
1783 :
1784 139356 : start_pos = round_down(pos, root->sectorsize);
1785 139356 : if (start_pos > i_size_read(inode)) {
1786 : /* Expand the hole to cover the write data, preventing an empty gap */
1787 3075 : end_pos = round_up(pos + count, root->sectorsize);
1788 3075 : err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
1789 3075 : if (err) {
1790 0 : mutex_unlock(&inode->i_mutex);
1791 0 : goto out;
1792 : }
1793 : }
1794 :
1795 139356 : if (sync)
1796 1095 : atomic_inc(&BTRFS_I(inode)->sync_writers);
1797 :
1798 139358 : if (unlikely(file->f_flags & O_DIRECT)) {
1799 25270 : num_written = __btrfs_direct_write(iocb, from, pos);
1800 : } else {
1801 114088 : num_written = __btrfs_buffered_write(file, from, pos);
1802 114088 : if (num_written > 0)
1803 114087 : iocb->ki_pos = pos + num_written;
1804 : }
1805 :
1806 139357 : mutex_unlock(&inode->i_mutex);
1807 :
1808 : /*
1809 : * we want to make sure fsync finds this change
1810 : * but we haven't joined a transaction running right now.
1811 : *
1812 : * Later on, someone is sure to update the inode and get the
1813 : * real transid recorded.
1814 : *
1815 : * We set last_trans now to the fs_info generation + 1,
1816 : * this will either be one more than the running transaction
1817 : * or the generation used for the next transaction if there isn't
1818 : * one running right now.
1819 : *
1820 : * We also have to set last_sub_trans to the current log transid,
1821 : * otherwise subsequent syncs to a file that's been synced in this
1822 : * transaction will appear to have already occurred.
1823 : */
1824 139358 : BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1825 139358 : BTRFS_I(inode)->last_sub_trans = root->log_transid;
1826 139358 : if (num_written > 0) {
1827 139347 : err = generic_write_sync(file, pos, num_written);
1828 139347 : if (err < 0)
1829 : num_written = err;
1830 : }
1831 :
1832 139358 : if (sync)
1833 1095 : atomic_dec(&BTRFS_I(inode)->sync_writers);
1834 : out:
1835 139383 : current->backing_dev_info = NULL;
1836 139383 : return num_written ? num_written : err;
1837 : }
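/*
 * A minimal userspace sketch of the O_DIRECT branch handled above, assuming
 * a btrfs file at "/mnt/btrfs/testfile" (the path is illustrative). Direct
 * I/O needs sector-aligned buffers, offsets and lengths; when only part of
 * a direct write succeeds, __btrfs_direct_write() finishes the remainder
 * through __btrfs_buffered_write() and flushes that range.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd = open("/mnt/btrfs/testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);

	if (fd < 0)
		return 1;
	/* O_DIRECT buffers must be aligned; 4096 covers common sector sizes */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0xab, 4096);
	/* an aligned 4096-byte write at offset 0 stays on the direct path */
	if (pwrite(fd, buf, 4096, 0) != 4096)
		return 1;
	free(buf);
	return close(fd);
}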
1838 :
1839 679049 : int btrfs_release_file(struct inode *inode, struct file *filp)
1840 : {
1841 679049 : if (filp->private_data)
1842 0 : btrfs_ioctl_trans_end(filp);
1843 : /*
1844 : * ordered_data_close is set by setattr when we are about to truncate
1845 : * a file from a non-zero size to a zero size. This tries to
1846 : * flush down new bytes that may have been written if the
1847 : * application was using truncate to replace a file in place.
1848 : */
1849 679078 : if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1850 679049 : &BTRFS_I(inode)->runtime_flags))
1851 476 : filemap_flush(inode->i_mapping);
1852 679078 : return 0;
1853 : }
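/*
 * A userspace sketch of the "truncate to replace a file in place" pattern
 * the comment above describes (the helper name and error handling are
 * illustrative). The truncation to zero in setattr sets
 * BTRFS_INODE_ORDERED_DATA_CLOSE, so the final close lands in
 * btrfs_release_file() and the filemap_flush() pushes the replacement
 * bytes out early.
 */
#include <fcntl.h>
#include <unistd.h>

int replace_in_place(const char *path, const char *data, size_t len)
{
	int fd = open(path, O_WRONLY | O_TRUNC);	/* truncate to zero */

	if (fd < 0)
		return -1;
	if (write(fd, data, len) != (ssize_t)len) {	/* new contents */
		close(fd);
		return -1;
	}
	return close(fd);	/* triggers btrfs_release_file() */
}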
1854 :
1855 : /*
1856 : * fsync call for both files and directories. This logs the inode into
1857 : * the tree log instead of forcing full commits whenever possible.
1858 : *
1859 : * It needs to call filemap_fdatawait so that all ordered extent updates
1860 : * in the metadata btree are up to date for copying to the log.
1861 : *
1862 : * It drops the inode mutex before doing the tree log commit. This is an
1863 : * important optimization for directories because holding the mutex prevents
1864 : * new operations on the dir while we write to disk.
1865 : */
1866 2498 : int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1867 : {
1868 2498 : struct dentry *dentry = file->f_path.dentry;
1869 2498 : struct inode *inode = dentry->d_inode;
1870 2498 : struct btrfs_root *root = BTRFS_I(inode)->root;
1871 : struct btrfs_trans_handle *trans;
1872 : struct btrfs_log_ctx ctx;
1873 : int ret = 0;
1874 : bool full_sync = false;
1875 :
1876 2498 : trace_btrfs_sync_file(file, datasync);
1877 :
1878 : /*
1879 : * We write the dirty pages in the range and wait until they complete
1880 : * outside of the ->i_mutex, so the dirty pages can be flushed by
1881 : * multiple tasks, which improves performance. See
1882 : * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1883 : */
1884 2498 : atomic_inc(&BTRFS_I(inode)->sync_writers);
1885 2498 : ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1886 4996 : if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1887 : &BTRFS_I(inode)->runtime_flags))
1888 47 : ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1889 : atomic_dec(&BTRFS_I(inode)->sync_writers);
1890 2498 : if (ret)
1891 : return ret;
1892 :
1893 2498 : mutex_lock(&inode->i_mutex);
1894 :
1895 : /*
1896 : * We flush the dirty pages again so that no dirty pages in the
1897 : * range are left behind.
1898 : */
1899 2498 : atomic_inc(&root->log_batch);
1900 : full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1901 : &BTRFS_I(inode)->runtime_flags);
1902 2498 : if (full_sync) {
1903 989 : ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
1904 989 : if (ret) {
1905 0 : mutex_unlock(&inode->i_mutex);
1906 0 : goto out;
1907 : }
1908 : }
1909 : atomic_inc(&root->log_batch);
1910 :
1911 : /*
1912 : * check the transaction that last modified this inode
1913 : * and see if it's already been committed
1914 : */
1915 2498 : if (!BTRFS_I(inode)->last_trans) {
1916 88 : mutex_unlock(&inode->i_mutex);
1917 88 : goto out;
1918 : }
1919 :
1920 : /*
1921 : * if the last transaction that changed this file was before
1922 : * the current transaction, we can bail out now without any
1923 : * syncing
1924 : */
1925 2410 : smp_mb();
1926 7216 : if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1927 2396 : BTRFS_I(inode)->last_trans <=
1928 2396 : root->fs_info->last_trans_committed) {
1929 841 : BTRFS_I(inode)->last_trans = 0;
1930 :
1931 : /*
1932 : * We've had everything committed since the last time we were
1933 : * modified so clear this flag in case it was set for whatever
1934 : * reason, it's no longer relevant.
1935 : */
1936 : clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1937 : &BTRFS_I(inode)->runtime_flags);
1938 841 : mutex_unlock(&inode->i_mutex);
1939 841 : goto out;
1940 : }
1941 :
1942 : /*
1943 : * ok we haven't committed the transaction yet, let's do a commit
1944 : */
1945 1569 : if (file->private_data)
1946 0 : btrfs_ioctl_trans_end(file);
1947 :
1948 : /*
1949 : * We use start here because we will need to wait on the IO to complete
1950 : * in btrfs_sync_log, which could require joining a transaction (for
1951 : * example checking cross references in the nocow path). If we use join
1952 : * here we could get into a situation where we're waiting on IO to
1953 : * happen that is blocked on a transaction trying to commit. With start
1954 : * we inc the extwriter counter, so we wait for all extwriters to exit
1955 : * before we start blocking join'ers. This comment is to keep somebody
1956 : * from thinking they are super smart and changing this to
1957 : * btrfs_join_transaction *cough*Josef*cough*.
1958 : */
1959 1569 : trans = btrfs_start_transaction(root, 0);
1960 1569 : if (IS_ERR(trans)) {
1961 0 : ret = PTR_ERR(trans);
1962 0 : mutex_unlock(&inode->i_mutex);
1963 0 : goto out;
1964 : }
1965 1569 : trans->sync = true;
1966 :
1967 : btrfs_init_log_ctx(&ctx);
1968 :
1969 1569 : ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
1970 1569 : if (ret < 0) {
1971 : /* Fallthrough and commit/free transaction. */
1972 : ret = 1;
1973 : }
1974 :
1975 : /* we've logged all the items and now have a consistent
1976 : * version of the file in the log. It is possible that
1977 : * someone will come in and modify the file, but that's
1978 : * fine because the log is consistent on disk, and we
1979 : * have references to all of the file's extents
1980 : *
1981 : * It is possible that someone will come in and log the
1982 : * file again, but that will end up using the synchronization
1983 : * inside btrfs_sync_log to keep things safe.
1984 : */
1985 1569 : mutex_unlock(&inode->i_mutex);
1986 :
1987 1569 : if (ret != BTRFS_NO_LOG_SYNC) {
1988 1569 : if (!ret) {
1989 1483 : ret = btrfs_sync_log(trans, root, &ctx);
1990 1483 : if (!ret) {
1991 1481 : ret = btrfs_end_transaction(trans, root);
1992 1481 : goto out;
1993 : }
1994 : }
1995 88 : if (!full_sync) {
1996 10 : ret = btrfs_wait_ordered_range(inode, start,
1997 10 : end - start + 1);
1998 10 : if (ret) {
1999 0 : btrfs_end_transaction(trans, root);
2000 0 : goto out;
2001 : }
2002 : }
2003 88 : ret = btrfs_commit_transaction(trans, root);
2004 : } else {
2005 0 : ret = btrfs_end_transaction(trans, root);
2006 : }
2007 : out:
2008 2498 : return ret > 0 ? -EIO : ret;
2009 : }
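/*
 * A userspace sketch of the common caller of btrfs_sync_file() (the helper
 * name and path handling are illustrative). On the fast path above only
 * the tree log is synced, not the whole transaction; fdatasync() reaches
 * the same handler with datasync == 1.
 */
#include <fcntl.h>
#include <unistd.h>

int write_durably(const char *path, const char *buf, size_t len)
{
	int fd = open(path, O_WRONLY | O_CREAT, 0644);

	if (fd < 0)
		return -1;
	if (write(fd, buf, len) != (ssize_t)len) {
		close(fd);
		return -1;
	}
	if (fsync(fd)) {	/* ends up in btrfs_sync_file() */
		close(fd);
		return -1;
	}
	return close(fd);
}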
2010 :
2011 : static const struct vm_operations_struct btrfs_file_vm_ops = {
2012 : .fault = filemap_fault,
2013 : .map_pages = filemap_map_pages,
2014 : .page_mkwrite = btrfs_page_mkwrite,
2015 : .remap_pages = generic_file_remap_pages,
2016 : };
2017 :
2018 1214001 : static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
2019 : {
2020 1214001 : struct address_space *mapping = filp->f_mapping;
2021 :
2022 1214001 : if (!mapping->a_ops->readpage)
2023 : return -ENOEXEC;
2024 :
2025 : file_accessed(filp);
2026 1214058 : vma->vm_ops = &btrfs_file_vm_ops;
2027 :
2028 1214058 : return 0;
2029 : }
2030 :
2031 192 : static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
2032 : int slot, u64 start, u64 end)
2033 : {
2034 : struct btrfs_file_extent_item *fi;
2035 : struct btrfs_key key;
2036 :
2037 192 : if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2038 : return 0;
2039 :
2040 96 : btrfs_item_key_to_cpu(leaf, &key, slot);
2041 278 : if (key.objectid != btrfs_ino(inode) ||
2042 86 : key.type != BTRFS_EXTENT_DATA_KEY)
2043 : return 0;
2044 :
2045 86 : fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2046 :
2047 86 : if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2048 : return 0;
2049 :
2050 65 : if (btrfs_file_extent_disk_bytenr(leaf, fi))
2051 : return 0;
2052 :
2053 56 : if (key.offset == end)
2054 : return 1;
2055 50 : if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2056 : return 1;
2057 0 : return 0;
2058 : }
2059 :
2060 73 : static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
2061 : struct btrfs_path *path, u64 offset, u64 end)
2062 : {
2063 73 : struct btrfs_root *root = BTRFS_I(inode)->root;
2064 : struct extent_buffer *leaf;
2065 : struct btrfs_file_extent_item *fi;
2066 : struct extent_map *hole_em;
2067 73 : struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2068 : struct btrfs_key key;
2069 : int ret;
2070 :
2071 146 : if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
2072 : goto out;
2073 :
2074 73 : key.objectid = btrfs_ino(inode);
2075 73 : key.type = BTRFS_EXTENT_DATA_KEY;
2076 73 : key.offset = offset;
2077 :
2078 73 : ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2079 73 : if (ret < 0)
2080 : return ret;
2081 73 : BUG_ON(!ret);
2082 :
2083 73 : leaf = path->nodes[0];
2084 73 : if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
2085 : u64 num_bytes;
2086 :
2087 50 : path->slots[0]--;
2088 50 : fi = btrfs_item_ptr(leaf, path->slots[0],
2089 : struct btrfs_file_extent_item);
2090 50 : num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2091 : end - offset;
2092 : btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2093 : btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2094 : btrfs_set_file_extent_offset(leaf, fi, 0);
2095 50 : btrfs_mark_buffer_dirty(leaf);
2096 50 : goto out;
2097 : }
2098 :
2099 23 : if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2100 : u64 num_bytes;
2101 :
2102 6 : key.offset = offset;
2103 6 : btrfs_set_item_key_safe(root, path, &key);
2104 12 : fi = btrfs_item_ptr(leaf, path->slots[0],
2105 : struct btrfs_file_extent_item);
2106 6 : num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2107 : offset;
2108 : btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2109 : btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2110 : btrfs_set_file_extent_offset(leaf, fi, 0);
2111 6 : btrfs_mark_buffer_dirty(leaf);
2112 6 : goto out;
2113 : }
2114 17 : btrfs_release_path(path);
2115 :
2116 34 : ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
2117 : 0, 0, end - offset, 0, end - offset,
2118 : 0, 0, 0);
2119 17 : if (ret)
2120 : return ret;
2121 :
2122 : out:
2123 73 : btrfs_release_path(path);
2124 :
2125 73 : hole_em = alloc_extent_map();
2126 73 : if (!hole_em) {
2127 0 : btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2128 : set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2129 : &BTRFS_I(inode)->runtime_flags);
2130 : } else {
2131 73 : hole_em->start = offset;
2132 73 : hole_em->len = end - offset;
2133 73 : hole_em->ram_bytes = hole_em->len;
2134 73 : hole_em->orig_start = offset;
2135 :
2136 73 : hole_em->block_start = EXTENT_MAP_HOLE;
2137 73 : hole_em->block_len = 0;
2138 73 : hole_em->orig_block_len = 0;
2139 73 : hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
2140 73 : hole_em->compress_type = BTRFS_COMPRESS_NONE;
2141 73 : hole_em->generation = trans->transid;
2142 :
2143 : do {
2144 73 : btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2145 73 : write_lock(&em_tree->lock);
2146 73 : ret = add_extent_mapping(em_tree, hole_em, 1);
2147 : write_unlock(&em_tree->lock);
2148 73 : } while (ret == -EEXIST);
2149 73 : free_extent_map(hole_em);
2150 73 : if (ret)
2151 : set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2152 : &BTRFS_I(inode)->runtime_flags);
2153 : }
2154 :
2155 : return 0;
2156 : }
2157 :
2158 : /*
2159 : * Find a hole extent on the given inode and change start/len to the end
2160 : * of the hole extent (a hole/vacuum extent is one whose em->start <= start
2161 : * && em->start + em->len > start).
2162 : * When a hole extent is found, return 1 and modify start/len.
2163 : */
2164 514 : static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
2165 : {
2166 : struct extent_map *em;
2167 : int ret = 0;
2168 :
2169 514 : em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
2170 514 : if (IS_ERR_OR_NULL(em)) {
2171 0 : if (!em)
2172 : ret = -ENOMEM;
2173 : else
2174 0 : ret = PTR_ERR(em);
2175 0 : return ret;
2176 : }
2177 :
2178 : /* Hole or vacuum extent (the latter only exists in no-holes mode) */
2179 514 : if (em->block_start == EXTENT_MAP_HOLE) {
2180 : ret = 1;
2181 788 : *len = em->start + em->len > *start + *len ?
2182 394 : 0 : *start + *len - em->start - em->len;
2183 394 : *start = em->start + em->len;
2184 : }
2185 514 : free_extent_map(em);
2186 514 : return ret;
2187 : }
2188 :
2189 359 : static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2190 : {
2191 830 : struct btrfs_root *root = BTRFS_I(inode)->root;
2192 359 : struct extent_state *cached_state = NULL;
2193 : struct btrfs_path *path;
2194 : struct btrfs_block_rsv *rsv;
2195 : struct btrfs_trans_handle *trans;
2196 : u64 lockstart;
2197 : u64 lockend;
2198 : u64 tail_start;
2199 : u64 tail_len;
2200 359 : u64 orig_start = offset;
2201 : u64 cur_offset;
2202 : u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
2203 : u64 drop_end;
2204 : int ret = 0;
2205 : int err = 0;
2206 : int rsv_count;
2207 : bool same_page;
2208 359 : bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2209 : u64 ino_size;
2210 :
2211 359 : ret = btrfs_wait_ordered_range(inode, offset, len);
2212 359 : if (ret)
2213 : return ret;
2214 :
2215 359 : mutex_lock(&inode->i_mutex);
2216 359 : ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2217 359 : ret = find_first_non_hole(inode, &offset, &len);
2218 359 : if (ret < 0)
2219 : goto out_only_mutex;
2220 359 : if (ret && !len) {
2221 : /* Already in a large hole */
2222 : ret = 0;
2223 : goto out_only_mutex;
2224 : }
2225 :
2226 113 : lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2227 226 : lockend = round_down(offset + len,
2228 113 : BTRFS_I(inode)->root->sectorsize) - 1;
2229 113 : same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2230 113 : ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2231 :
2232 : /*
2233 : * We needn't truncate any page which is beyond the end of the file
2234 : * because we are sure there is no data there.
2235 : *
2236 : * Only do this if we are in the same page and we aren't
2237 : * doing the entire page.
2238 : */
2239 :
2240 113 : if (same_page && len < PAGE_CACHE_SIZE) {
2241 1 : if (offset < ino_size)
2242 0 : ret = btrfs_truncate_page(inode, offset, len, 0);
2243 : goto out_only_mutex;
2244 : }
2245 :
2246 : /* zero back part of the first page */
2247 112 : if (offset < ino_size) {
2248 74 : ret = btrfs_truncate_page(inode, offset, 0, 0);
2249 74 : if (ret) {
2250 0 : mutex_unlock(&inode->i_mutex);
2251 0 : return ret;
2252 : }
2253 : }
2254 :
2255 : /* Check the aligned pages after the first unaligned page.
2256 : * If offset != orig_start, the first unaligned page and
2257 : * several following pages are already in holes, so the
2258 : * extra check can be skipped. */
2259 112 : if (offset == orig_start) {
2260 : /* after truncate page, check hole again */
2261 44 : len = offset + len - lockstart;
2262 44 : offset = lockstart;
2263 44 : ret = find_first_non_hole(inode, &offset, &len);
2264 44 : if (ret < 0)
2265 : goto out_only_mutex;
2266 44 : if (ret && !len) {
2267 : ret = 0;
2268 : goto out_only_mutex;
2269 : }
2270 44 : lockstart = offset;
2271 : }
2272 :
2273 : /* Check the tail unaligned part is in a hole */
2274 112 : tail_start = lockend + 1;
2275 112 : tail_len = offset + len - tail_start;
2276 112 : if (tail_len) {
2277 111 : ret = find_first_non_hole(inode, &tail_start, &tail_len);
2278 111 : if (unlikely(ret < 0))
2279 : goto out_only_mutex;
2280 111 : if (!ret) {
2281 : /* zero the front end of the last page */
2282 34 : if (tail_start + tail_len < ino_size) {
2283 18 : ret = btrfs_truncate_page(inode,
2284 : tail_start + tail_len, 0, 1);
2285 18 : if (ret)
2286 : goto out_only_mutex;
2287 : }
2288 : }
2289 : }
2290 :
2291 112 : if (lockend < lockstart) {
2292 0 : mutex_unlock(&inode->i_mutex);
2293 0 : return 0;
2294 : }
2295 :
2296 : while (1) {
2297 : struct btrfs_ordered_extent *ordered;
2298 :
2299 112 : truncate_pagecache_range(inode, lockstart, lockend);
2300 :
2301 112 : lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2302 : 0, &cached_state);
2303 112 : ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
2304 :
2305 : /*
2306 : * We need to make sure we have no ordered extents in this range
2307 : * and that nobody raced in and read a page in this range; if
2308 : * either happened, we need to try again.
2309 : */
2310 112 : if ((!ordered ||
2311 0 : (ordered->file_offset + ordered->len <= lockstart ||
2312 112 : ordered->file_offset > lockend)) &&
2313 112 : !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
2314 112 : if (ordered)
2315 0 : btrfs_put_ordered_extent(ordered);
2316 : break;
2317 : }
2318 0 : if (ordered)
2319 0 : btrfs_put_ordered_extent(ordered);
2320 0 : unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2321 : lockend, &cached_state, GFP_NOFS);
2322 0 : ret = btrfs_wait_ordered_range(inode, lockstart,
2323 0 : lockend - lockstart + 1);
2324 0 : if (ret) {
2325 0 : mutex_unlock(&inode->i_mutex);
2326 0 : return ret;
2327 : }
2328 : }
2329 :
2330 112 : path = btrfs_alloc_path();
2331 112 : if (!path) {
2332 : ret = -ENOMEM;
2333 : goto out;
2334 : }
2335 :
2336 112 : rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2337 112 : if (!rsv) {
2338 : ret = -ENOMEM;
2339 : goto out_free;
2340 : }
2341 112 : rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
2342 112 : rsv->failfast = 1;
2343 :
2344 : /*
2345 : * 1 - update the inode
2346 : * 1 - removing the extents in the range
2347 : * 1 - adding the hole extent if no_holes isn't set
2348 : */
2349 112 : rsv_count = no_holes ? 2 : 3;
2350 112 : trans = btrfs_start_transaction(root, rsv_count);
2351 112 : if (IS_ERR(trans)) {
2352 0 : err = PTR_ERR(trans);
2353 0 : goto out_free;
2354 : }
2355 :
2356 112 : ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
2357 : min_size);
2358 112 : BUG_ON(ret);
2359 112 : trans->block_rsv = rsv;
2360 :
2361 112 : cur_offset = lockstart;
2362 112 : len = lockend - cur_offset;
2363 224 : while (cur_offset < lockend) {
2364 112 : ret = __btrfs_drop_extents(trans, root, inode, path,
2365 : cur_offset, lockend + 1,
2366 : &drop_end, 1, 0, 0, NULL);
2367 112 : if (ret != -ENOSPC)
2368 : break;
2369 :
2370 0 : trans->block_rsv = &root->fs_info->trans_block_rsv;
2371 :
2372 0 : if (cur_offset < ino_size) {
2373 0 : ret = fill_holes(trans, inode, path, cur_offset,
2374 : drop_end);
2375 0 : if (ret) {
2376 : err = ret;
2377 : break;
2378 : }
2379 : }
2380 :
2381 0 : cur_offset = drop_end;
2382 :
2383 0 : ret = btrfs_update_inode(trans, root, inode);
2384 0 : if (ret) {
2385 : err = ret;
2386 : break;
2387 : }
2388 :
2389 0 : btrfs_end_transaction(trans, root);
2390 0 : btrfs_btree_balance_dirty(root);
2391 :
2392 0 : trans = btrfs_start_transaction(root, rsv_count);
2393 0 : if (IS_ERR(trans)) {
2394 0 : ret = PTR_ERR(trans);
2395 : trans = NULL;
2396 0 : break;
2397 : }
2398 :
2399 0 : ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
2400 : rsv, min_size);
2401 0 : BUG_ON(ret); /* shouldn't happen */
2402 0 : trans->block_rsv = rsv;
2403 :
2404 0 : ret = find_first_non_hole(inode, &cur_offset, &len);
2405 0 : if (unlikely(ret < 0))
2406 : break;
2407 0 : if (ret && !len) {
2408 : ret = 0;
2409 : break;
2410 : }
2411 : }
2412 :
2413 112 : if (ret) {
2414 : err = ret;
2415 : goto out_trans;
2416 : }
2417 :
2418 112 : trans->block_rsv = &root->fs_info->trans_block_rsv;
2419 : /*
2420 : * Don't insert file hole extent item if it's for a range beyond eof
2421 : * (because it's useless) or if it represents a 0-byte range (when
2422 : * cur_offset == drop_end).
2423 : */
2424 112 : if (cur_offset < ino_size && cur_offset < drop_end) {
2425 73 : ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2426 73 : if (ret) {
2427 : err = ret;
2428 0 : goto out_trans;
2429 : }
2430 : }
2431 :
2432 : out_trans:
2433 112 : if (!trans)
2434 : goto out_free;
2435 :
2436 : inode_inc_iversion(inode);
2437 112 : inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2438 :
2439 112 : trans->block_rsv = &root->fs_info->trans_block_rsv;
2440 112 : ret = btrfs_update_inode(trans, root, inode);
2441 112 : btrfs_end_transaction(trans, root);
2442 112 : btrfs_btree_balance_dirty(root);
2443 : out_free:
2444 112 : btrfs_free_path(path);
2445 112 : btrfs_free_block_rsv(root, rsv);
2446 : out:
2447 112 : unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2448 : &cached_state, GFP_NOFS);
2449 : out_only_mutex:
2450 359 : mutex_unlock(&inode->i_mutex);
2451 359 : if (ret && !err)
2452 : err = ret;
2453 359 : return err;
2454 : }
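/*
 * A userspace sketch of the call that reaches btrfs_punch_hole() (the
 * offsets are illustrative). FALLOC_FL_PUNCH_HOLE must be paired with
 * FALLOC_FL_KEEP_SIZE; unaligned head and tail pages are zeroed in place
 * by btrfs_truncate_page() while fully covered blocks are dropped.
 */
#define _GNU_SOURCE
#include <fcntl.h>

int punch(int fd)
{
	/* drop bytes [64KiB, 192KiB) without changing i_size */
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 64 * 1024, 128 * 1024);
}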
2455 :
2456 3730 : static long btrfs_fallocate(struct file *file, int mode,
2457 : loff_t offset, loff_t len)
2458 : {
2459 : struct inode *inode = file_inode(file);
2460 3730 : struct extent_state *cached_state = NULL;
2461 3730 : struct btrfs_root *root = BTRFS_I(inode)->root;
2462 : u64 cur_offset;
2463 : u64 last_byte;
2464 : u64 alloc_start;
2465 : u64 alloc_end;
2466 3730 : u64 alloc_hint = 0;
2467 : u64 locked_end;
2468 225822 : struct extent_map *em;
2469 3730 : int blocksize = BTRFS_I(inode)->root->sectorsize;
2470 : int ret;
2471 :
2472 3730 : alloc_start = round_down(offset, blocksize);
2473 3730 : alloc_end = round_up(offset + len, blocksize);
2474 :
2475 : /* Make sure we aren't being given some crap mode */
2476 3730 : if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2477 : return -EOPNOTSUPP;
2478 :
2479 2664 : if (mode & FALLOC_FL_PUNCH_HOLE)
2480 359 : return btrfs_punch_hole(inode, offset, len);
2481 :
2482 : /*
2483 : * Make sure we have enough space before we do the
2484 : * allocation.
2485 : */
2486 2305 : ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
2487 2305 : if (ret)
2488 0 : return ret;
2489 2305 : if (root->fs_info->quota_enabled) {
2490 194 : ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
2491 194 : if (ret)
2492 : goto out_reserve_fail;
2493 : }
2494 :
2495 2305 : mutex_lock(&inode->i_mutex);
2496 2305 : ret = inode_newsize_ok(inode, alloc_end);
2497 2305 : if (ret)
2498 : goto out;
2499 :
2500 2305 : if (alloc_start > inode->i_size) {
2501 920 : ret = btrfs_cont_expand(inode, i_size_read(inode),
2502 : alloc_start);
2503 920 : if (ret)
2504 : goto out;
2505 : } else {
2506 : /*
2507 : * If we are fallocating from the end of the file onward we
2508 : * need to zero out the end of the page if i_size lands in the
2509 : * middle of a page.
2510 : */
2511 1385 : ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
2512 1385 : if (ret)
2513 : goto out;
2514 : }
2515 :
2516 : /*
2517 : * wait for ordered IO before we have any locks. We'll loop again
2518 : * below with the locks held.
2519 : */
2520 2305 : ret = btrfs_wait_ordered_range(inode, alloc_start,
2521 : alloc_end - alloc_start);
2522 2305 : if (ret)
2523 : goto out;
2524 :
2525 2305 : locked_end = alloc_end - 1;
2526 : while (1) {
2527 : struct btrfs_ordered_extent *ordered;
2528 :
2529 : /* the extent lock is ordered inside the running
2530 : * transaction
2531 : */
2532 2305 : lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
2533 : locked_end, 0, &cached_state);
2534 2305 : ordered = btrfs_lookup_first_ordered_extent(inode,
2535 : alloc_end - 1);
2536 2308 : if (ordered &&
2537 3 : ordered->file_offset + ordered->len > alloc_start &&
2538 : ordered->file_offset < alloc_end) {
2539 0 : btrfs_put_ordered_extent(ordered);
2540 0 : unlock_extent_cached(&BTRFS_I(inode)->io_tree,
2541 : alloc_start, locked_end,
2542 : &cached_state, GFP_NOFS);
2543 : /*
2544 : * we can't wait on the range with the transaction
2545 : * running or with the extent lock held
2546 : */
2547 0 : ret = btrfs_wait_ordered_range(inode, alloc_start,
2548 : alloc_end - alloc_start);
2549 0 : if (ret)
2550 : goto out;
2551 : } else {
2552 2305 : if (ordered)
2553 3 : btrfs_put_ordered_extent(ordered);
2554 : break;
2555 : }
2556 : }
2557 :
2558 : cur_offset = alloc_start;
2559 : while (1) {
2560 : u64 actual_end;
2561 :
2562 225822 : em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2563 : alloc_end - cur_offset, 0);
2564 225822 : if (IS_ERR_OR_NULL(em)) {
2565 0 : if (!em)
2566 : ret = -ENOMEM;
2567 : else
2568 0 : ret = PTR_ERR(em);
2569 : break;
2570 : }
2571 225822 : last_byte = min(extent_map_end(em), alloc_end);
2572 225822 : actual_end = min_t(u64, extent_map_end(em), offset + len);
2573 225822 : last_byte = ALIGN(last_byte, blocksize);
2574 :
2575 449058 : if (em->block_start == EXTENT_MAP_HOLE ||
2576 223374 : (cur_offset >= inode->i_size &&
2577 : !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
2578 2586 : ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
2579 : last_byte - cur_offset,
2580 2586 : 1 << inode->i_blkbits,
2581 : offset + len,
2582 : &alloc_hint);
2583 :
2584 2586 : if (ret < 0) {
2585 0 : free_extent_map(em);
2586 0 : break;
2587 : }
2588 223597 : } else if (actual_end > inode->i_size &&
2589 361 : !(mode & FALLOC_FL_KEEP_SIZE)) {
2590 : /*
2591 : * We didn't need to allocate any more space, but we
2592 : * still extended the size of the file so we need to
2593 : * update i_size.
2594 : */
2595 93 : inode->i_ctime = CURRENT_TIME;
2596 93 : i_size_write(inode, actual_end);
2597 93 : btrfs_ordered_update_i_size(inode, actual_end, NULL);
2598 : }
2599 225822 : free_extent_map(em);
2600 :
2601 : cur_offset = last_byte;
2602 225822 : if (cur_offset >= alloc_end) {
2603 : ret = 0;
2604 : break;
2605 : }
2606 : }
2607 2305 : unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2608 : &cached_state, GFP_NOFS);
2609 : out:
2610 2305 : mutex_unlock(&inode->i_mutex);
2611 2305 : if (root->fs_info->quota_enabled)
2612 194 : btrfs_qgroup_free(root, alloc_end - alloc_start);
2613 : out_reserve_fail:
2614 : /* Let go of our reservation. */
2615 2305 : btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2616 2305 : return ret;
2617 : }
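/*
 * A userspace sketch of preallocation through btrfs_fallocate() (the
 * sizes are illustrative). Plain mode 0 allocates unwritten extents via
 * btrfs_prealloc_file_range() and extends i_size; FALLOC_FL_KEEP_SIZE
 * reserves space past EOF while leaving i_size untouched.
 */
#define _GNU_SOURCE
#include <fcntl.h>

int prealloc(int fd)
{
	/* reserve 16 MiB from offset 0, growing the file if needed */
	if (fallocate(fd, 0, 0, 16 << 20))
		return -1;
	/* reserve another 4 MiB past EOF without changing i_size */
	return fallocate(fd, FALLOC_FL_KEEP_SIZE, 16 << 20, 4 << 20);
}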
2618 :
2619 1482 : static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2620 : {
2621 741 : struct btrfs_root *root = BTRFS_I(inode)->root;
2622 : struct extent_map *em = NULL;
2623 741 : struct extent_state *cached_state = NULL;
2624 741 : u64 lockstart = *offset;
2625 741 : u64 lockend = i_size_read(inode);
2626 : u64 start = *offset;
2627 : u64 len = i_size_read(inode);
2628 : int ret = 0;
2629 :
2630 741 : lockend = max_t(u64, root->sectorsize, lockend);
2631 741 : if (lockend <= lockstart)
2632 0 : lockend = lockstart + root->sectorsize;
2633 :
2634 741 : lockend--;
2635 741 : len = lockend - lockstart + 1;
2636 :
2637 741 : len = max_t(u64, len, root->sectorsize);
2638 741 : if (inode->i_size == 0)
2639 : return -ENXIO;
2640 :
2641 741 : lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
2642 : &cached_state);
2643 :
2644 1540 : while (start < inode->i_size) {
2645 795 : em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
2646 795 : if (IS_ERR(em)) {
2647 0 : ret = PTR_ERR(em);
2648 : em = NULL;
2649 0 : break;
2650 : }
2651 :
2652 797 : if (whence == SEEK_HOLE &&
2653 3 : (em->block_start == EXTENT_MAP_HOLE ||
2654 : test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
2655 : break;
2656 1587 : else if (whence == SEEK_DATA &&
2657 1540 : (em->block_start != EXTENT_MAP_HOLE &&
2658 : !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
2659 : break;
2660 :
2661 58 : start = em->start + em->len;
2662 58 : free_extent_map(em);
2663 : em = NULL;
2664 58 : cond_resched();
2665 : }
2666 741 : free_extent_map(em);
2667 741 : if (!ret) {
2668 741 : if (whence == SEEK_DATA && start >= inode->i_size)
2669 : ret = -ENXIO;
2670 : else
2671 737 : *offset = min_t(loff_t, start, inode->i_size);
2672 : }
2673 741 : unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2674 : &cached_state, GFP_NOFS);
2675 741 : return ret;
2676 : }
2677 :
2678 13319 : static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2679 : {
2680 14236 : struct inode *inode = file->f_mapping->host;
2681 : int ret;
2682 :
2683 13319 : mutex_lock(&inode->i_mutex);
2684 13319 : switch (whence) {
2685 : case SEEK_END:
2686 : case SEEK_CUR:
2687 7950 : offset = generic_file_llseek(file, offset, whence);
2688 7950 : goto out;
2689 : case SEEK_DATA:
2690 : case SEEK_HOLE:
2691 917 : if (offset >= i_size_read(inode)) {
2692 176 : mutex_unlock(&inode->i_mutex);
2693 176 : return -ENXIO;
2694 : }
2695 :
2696 741 : ret = find_desired_extent(inode, &offset, whence);
2697 741 : if (ret) {
2698 4 : mutex_unlock(&inode->i_mutex);
2699 4 : return ret;
2700 : }
2701 : }
2702 :
2703 5189 : offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2704 : out:
2705 13139 : mutex_unlock(&inode->i_mutex);
2706 13139 : return offset;
2707 : }
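/*
 * A userspace sketch of walking a sparse file with the SEEK_DATA/SEEK_HOLE
 * support implemented by find_desired_extent() above (the helper name is
 * illustrative). Seeking past the last data returns -1 with errno set to
 * ENXIO, which terminates the loop.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>

void dump_extents(int fd)
{
	off_t data = 0, hole;

	while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		printf("data: [%lld, %lld)\n",
		       (long long)data, (long long)hole);
		data = hole;
	}
}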
2708 :
2709 : const struct file_operations btrfs_file_operations = {
2710 : .llseek = btrfs_file_llseek,
2711 : .read = new_sync_read,
2712 : .write = new_sync_write,
2713 : .read_iter = generic_file_read_iter,
2714 : .splice_read = generic_file_splice_read,
2715 : .write_iter = btrfs_file_write_iter,
2716 : .mmap = btrfs_file_mmap,
2717 : .open = generic_file_open,
2718 : .release = btrfs_release_file,
2719 : .fsync = btrfs_sync_file,
2720 : .fallocate = btrfs_fallocate,
2721 : .unlocked_ioctl = btrfs_ioctl,
2722 : #ifdef CONFIG_COMPAT
2723 : .compat_ioctl = btrfs_ioctl,
2724 : #endif
2725 : };
2726 :
2727 0 : void btrfs_auto_defrag_exit(void)
2728 : {
2729 0 : if (btrfs_inode_defrag_cachep)
2730 0 : kmem_cache_destroy(btrfs_inode_defrag_cachep);
2731 0 : }
2732 :
2733 0 : int btrfs_auto_defrag_init(void)
2734 : {
2735 0 : btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2736 : sizeof(struct inode_defrag), 0,
2737 : SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2738 : NULL);
2739 0 : if (!btrfs_inode_defrag_cachep)
2740 : return -ENOMEM;
2741 :
2742 0 : return 0;
2743 : }