Line data Source code
1 : /*
2 : * Copyright (C) 2007 Oracle. All rights reserved.
3 : *
4 : * This program is free software; you can redistribute it and/or
5 : * modify it under the terms of the GNU General Public
6 : * License v2 as published by the Free Software Foundation.
7 : *
8 : * This program is distributed in the hope that it will be useful,
9 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 : * General Public License for more details.
12 : *
13 : * You should have received a copy of the GNU General Public
14 : * License along with this program; if not, write to the
15 : * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 : * Boston, MA 02111-1307, USA.
17 : */
18 :
19 : #include <linux/kernel.h>
20 : #include <linux/bio.h>
21 : #include <linux/buffer_head.h>
22 : #include <linux/file.h>
23 : #include <linux/fs.h>
24 : #include <linux/pagemap.h>
25 : #include <linux/highmem.h>
26 : #include <linux/time.h>
27 : #include <linux/init.h>
28 : #include <linux/string.h>
29 : #include <linux/backing-dev.h>
30 : #include <linux/mpage.h>
31 : #include <linux/swap.h>
32 : #include <linux/writeback.h>
33 : #include <linux/statfs.h>
34 : #include <linux/compat.h>
35 : #include <linux/aio.h>
36 : #include <linux/bit_spinlock.h>
37 : #include <linux/xattr.h>
38 : #include <linux/posix_acl.h>
39 : #include <linux/falloc.h>
40 : #include <linux/slab.h>
41 : #include <linux/ratelimit.h>
42 : #include <linux/mount.h>
43 : #include <linux/btrfs.h>
44 : #include <linux/blkdev.h>
45 : #include <linux/posix_acl_xattr.h>
46 : #include "ctree.h"
47 : #include "disk-io.h"
48 : #include "transaction.h"
49 : #include "btrfs_inode.h"
50 : #include "print-tree.h"
51 : #include "ordered-data.h"
52 : #include "xattr.h"
53 : #include "tree-log.h"
54 : #include "volumes.h"
55 : #include "compression.h"
56 : #include "locking.h"
57 : #include "free-space-cache.h"
58 : #include "inode-map.h"
59 : #include "backref.h"
60 : #include "hash.h"
61 : #include "props.h"
62 :
63 : struct btrfs_iget_args {
64 : struct btrfs_key *location;
65 : struct btrfs_root *root;
66 : };
67 :
68 : static const struct inode_operations btrfs_dir_inode_operations;
69 : static const struct inode_operations btrfs_symlink_inode_operations;
70 : static const struct inode_operations btrfs_dir_ro_inode_operations;
71 : static const struct inode_operations btrfs_special_inode_operations;
72 : static const struct inode_operations btrfs_file_inode_operations;
73 : static const struct address_space_operations btrfs_aops;
74 : static const struct address_space_operations btrfs_symlink_aops;
75 : static const struct file_operations btrfs_dir_file_operations;
76 : static struct extent_io_ops btrfs_extent_io_ops;
77 :
78 : static struct kmem_cache *btrfs_inode_cachep;
79 : static struct kmem_cache *btrfs_delalloc_work_cachep;
80 : struct kmem_cache *btrfs_trans_handle_cachep;
81 : struct kmem_cache *btrfs_transaction_cachep;
82 : struct kmem_cache *btrfs_path_cachep;
83 : struct kmem_cache *btrfs_free_space_cachep;
84 :
85 : #define S_SHIFT 12
86 : static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
87 : [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
88 : [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
89 : [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
90 : [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
91 : [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
92 : [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
93 : [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
94 : };
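/*
 * Illustrative use of the table above (the local variable is hypothetical,
 * not from this file):
 *
 *	u8 ftype = btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
 *
 * e.g. S_IFDIR is 0040000, and 0040000 >> 12 == 4 selects BTRFS_FT_DIR.
 */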
95 :
96 : static int btrfs_setsize(struct inode *inode, struct iattr *attr);
97 : static int btrfs_truncate(struct inode *inode);
98 : static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
99 : static noinline int cow_file_range(struct inode *inode,
100 : struct page *locked_page,
101 : u64 start, u64 end, int *page_started,
102 : unsigned long *nr_written, int unlock);
103 : static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
104 : u64 len, u64 orig_start,
105 : u64 block_start, u64 block_len,
106 : u64 orig_block_len, u64 ram_bytes,
107 : int type);
108 :
109 : static int btrfs_dirty_inode(struct inode *inode);
110 :
111 20423 : static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
112 : struct inode *inode, struct inode *dir,
113 : const struct qstr *qstr)
114 : {
115 : int err;
116 :
117 20423 : err = btrfs_init_acl(trans, inode, dir);
118 20423 : if (!err)
119 20423 : err = btrfs_xattr_security_init(trans, inode, dir, qstr);
120 20423 : return err;
121 : }
122 :
123 : /*
124 : * this does all the hard work for inserting an inline extent into
125 : * the btree. The caller should have done a btrfs_drop_extents so that
126 : * no overlapping inline items exist in the btree
127 : */
128 3851 : static int insert_inline_extent(struct btrfs_trans_handle *trans,
129 : struct btrfs_path *path, int extent_inserted,
130 : struct btrfs_root *root, struct inode *inode,
131 : u64 start, size_t size, size_t compressed_size,
132 : int compress_type,
133 : struct page **compressed_pages)
134 : {
135 : struct extent_buffer *leaf;
136 : struct page *page = NULL;
137 : char *kaddr;
138 : unsigned long ptr;
139 : struct btrfs_file_extent_item *ei;
140 : int err = 0;
141 : int ret;
142 : size_t cur_size = size;
143 : unsigned long offset;
144 :
145 3851 : if (compressed_size && compressed_pages)
146 : cur_size = compressed_size;
147 :
148 3851 : inode_add_bytes(inode, size);
149 :
150 3851 : if (!extent_inserted) {
151 : struct btrfs_key key;
152 : size_t datasize;
153 :
154 2586 : key.objectid = btrfs_ino(inode);
155 2586 : key.offset = start;
156 : btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
157 :
158 2586 : datasize = btrfs_file_extent_calc_inline_size(cur_size);
159 2586 : path->leave_spinning = 1;
160 : ret = btrfs_insert_empty_item(trans, root, path, &key,
161 : datasize);
162 2586 : if (ret) {
163 : err = ret;
164 0 : goto fail;
165 : }
166 : }
167 3851 : leaf = path->nodes[0];
168 7702 : ei = btrfs_item_ptr(leaf, path->slots[0],
169 : struct btrfs_file_extent_item);
170 3851 : btrfs_set_file_extent_generation(leaf, ei, trans->transid);
171 : btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
172 : btrfs_set_file_extent_encryption(leaf, ei, 0);
173 : btrfs_set_file_extent_other_encoding(leaf, ei, 0);
174 : btrfs_set_file_extent_ram_bytes(leaf, ei, size);
175 : ptr = btrfs_file_extent_inline_start(ei);
176 :
177 3851 : if (compress_type != BTRFS_COMPRESS_NONE) {
178 : struct page *cpage;
179 : int i = 0;
180 0 : while (compressed_size > 0) {
181 0 : cpage = compressed_pages[i];
182 0 : cur_size = min_t(unsigned long, compressed_size,
183 : PAGE_CACHE_SIZE);
184 :
185 : kaddr = kmap_atomic(cpage);
186 0 : write_extent_buffer(leaf, kaddr, ptr, cur_size);
187 : kunmap_atomic(kaddr);
188 :
189 0 : i++;
190 0 : ptr += cur_size;
191 0 : compressed_size -= cur_size;
192 : }
193 0 : btrfs_set_file_extent_compression(leaf, ei,
194 : compress_type);
195 : } else {
196 3851 : page = find_get_page(inode->i_mapping,
197 3851 : start >> PAGE_CACHE_SHIFT);
198 : btrfs_set_file_extent_compression(leaf, ei, 0);
199 : kaddr = kmap_atomic(page);
200 3851 : offset = start & (PAGE_CACHE_SIZE - 1);
201 3851 : write_extent_buffer(leaf, kaddr + offset, ptr, size);
202 : kunmap_atomic(kaddr);
203 3851 : page_cache_release(page);
204 : }
205 3851 : btrfs_mark_buffer_dirty(leaf);
206 3851 : btrfs_release_path(path);
207 :
208 : /*
209 : * we're an inline extent, so nobody can
210 : * extend the file past i_size without locking
211 : * a page we already have locked.
212 : *
213 : * We must do any isize and inode updates
214 : * before we unlock the pages. Otherwise we
215 : * could end up racing with unlink.
216 : */
217 3851 : BTRFS_I(inode)->disk_i_size = inode->i_size;
218 3851 : ret = btrfs_update_inode(trans, root, inode);
219 :
220 3851 : return ret;
221 : fail:
222 0 : return err;
223 : }
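/*
 * A sketch of what btrfs_file_extent_calc_inline_size() works out to,
 * assuming the definitions in ctree.h (the helper name here is
 * hypothetical): an inline item stores the file extent header up to the
 * disk_bytenr field, followed directly by the payload bytes.
 */
static inline u32 inline_item_size_sketch(u32 datasize)
{
	/* header bytes that precede the inline data, plus the payload */
	return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
}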
224 :
225 :
226 : /*
227 : * conditionally insert an inline extent into the file. This
228 : * does the checks required to make sure the data is small enough
229 : * to fit as an inline extent.
230 : */
231 11214 : static noinline int cow_file_range_inline(struct btrfs_root *root,
232 11214 : struct inode *inode, u64 start,
233 : u64 end, size_t compressed_size,
234 : int compress_type,
235 : struct page **compressed_pages)
236 : {
237 : struct btrfs_trans_handle *trans;
238 11214 : u64 isize = i_size_read(inode);
239 11214 : u64 actual_end = min(end + 1, isize);
240 11214 : u64 inline_len = actual_end - start;
241 11214 : u64 aligned_end = ALIGN(end, root->sectorsize);
242 : u64 data_len = inline_len;
243 : int ret;
244 : struct btrfs_path *path;
245 11214 : int extent_inserted = 0;
246 : u32 extent_item_size;
247 :
248 11214 : if (compressed_size)
249 : data_len = compressed_size;
250 :
251 22428 : if (start > 0 ||
252 15065 : actual_end >= PAGE_CACHE_SIZE ||
253 7702 : data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
254 3851 : (!compressed_size &&
255 7702 : (actual_end & (root->sectorsize - 1)) == 0) ||
256 3851 : end + 1 < isize ||
257 3851 : data_len > root->fs_info->max_inline) {
258 : return 1;
259 : }
260 :
261 3851 : path = btrfs_alloc_path();
262 3851 : if (!path)
263 : return -ENOMEM;
264 :
265 3851 : trans = btrfs_join_transaction(root);
266 3851 : if (IS_ERR(trans)) {
267 0 : btrfs_free_path(path);
268 0 : return PTR_ERR(trans);
269 : }
270 3851 : trans->block_rsv = &root->fs_info->delalloc_block_rsv;
271 :
272 3851 : if (compressed_size && compressed_pages)
273 0 : extent_item_size = btrfs_file_extent_calc_inline_size(
274 : compressed_size);
275 : else
276 3851 : extent_item_size = btrfs_file_extent_calc_inline_size(
277 : inline_len);
278 :
279 3851 : ret = __btrfs_drop_extents(trans, root, inode, path,
280 : start, aligned_end, NULL,
281 : 1, 1, extent_item_size, &extent_inserted);
282 3851 : if (ret) {
283 0 : btrfs_abort_transaction(trans, root, ret);
284 0 : goto out;
285 : }
286 :
287 3851 : if (isize > actual_end)
288 0 : inline_len = min_t(u64, isize, actual_end);
289 3851 : ret = insert_inline_extent(trans, path, extent_inserted,
290 : root, inode, start,
291 : inline_len, compressed_size,
292 : compress_type, compressed_pages);
293 3851 : if (ret && ret != -ENOSPC) {
294 0 : btrfs_abort_transaction(trans, root, ret);
295 0 : goto out;
296 3851 : } else if (ret == -ENOSPC) {
297 : ret = 1;
298 : goto out;
299 : }
300 :
301 : set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
302 3851 : btrfs_delalloc_release_metadata(inode, end + 1 - start);
303 3851 : btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
304 : out:
305 3851 : btrfs_free_path(path);
306 3851 : btrfs_end_transaction(trans, root);
307 3851 : return ret;
308 : }
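/*
 * The eligibility test at the top of cow_file_range_inline(), restated as
 * a hypothetical boolean helper (a sketch, not part of this file): data is
 * only inlined when it starts at offset 0, fits inside one page, one leaf
 * item and the mount's max_inline limit, and covers the file through
 * i_size (so nothing past the inline bytes exists in the file).
 */
static inline bool inline_eligible_sketch(struct btrfs_root *root,
					  u64 start, u64 end, u64 isize,
					  u64 actual_end, u64 data_len,
					  size_t compressed_size)
{
	return start == 0 &&
	       actual_end < PAGE_CACHE_SIZE &&
	       data_len < BTRFS_MAX_INLINE_DATA_SIZE(root) &&
	       (compressed_size ||
		(actual_end & (root->sectorsize - 1)) != 0) &&
	       end + 1 >= isize &&
	       data_len <= root->fs_info->max_inline;
}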
309 :
310 : struct async_extent {
311 : u64 start;
312 : u64 ram_size;
313 : u64 compressed_size;
314 : struct page **pages;
315 : unsigned long nr_pages;
316 : int compress_type;
317 : struct list_head list;
318 : };
319 :
320 : struct async_cow {
321 : struct inode *inode;
322 : struct btrfs_root *root;
323 : struct page *locked_page;
324 : u64 start;
325 : u64 end;
326 : struct list_head extents;
327 : struct btrfs_work work;
328 : };
329 :
330 234 : static noinline int add_async_extent(struct async_cow *cow,
331 : u64 start, u64 ram_size,
332 : u64 compressed_size,
333 : struct page **pages,
334 : unsigned long nr_pages,
335 : int compress_type)
336 : {
337 : struct async_extent *async_extent;
338 :
339 : async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
340 234 : BUG_ON(!async_extent); /* -ENOMEM */
341 234 : async_extent->start = start;
342 234 : async_extent->ram_size = ram_size;
343 234 : async_extent->compressed_size = compressed_size;
344 234 : async_extent->pages = pages;
345 234 : async_extent->nr_pages = nr_pages;
346 234 : async_extent->compress_type = compress_type;
347 234 : list_add_tail(&async_extent->list, &cow->extents);
348 234 : return 0;
349 : }
350 :
351 : /*
352 : * we create compressed extents in two phases. The first
353 : * phase compresses a range of pages that have already been
354 : * locked (both pages and state bits are locked).
355 : *
356 : * This is done inside an ordered work queue, and the compression
357 : * is spread across many cpus. The actual IO submission is step
358 : * two, and the ordered work queue takes care of making sure that
359 : * happens in the same order things were put onto the queue by
360 : * writepages and friends.
361 : *
362 : * If this code finds it can't get good compression, it puts an
363 : * entry onto the work queue to write the uncompressed bytes. This
364 : * makes sure that both compressed inodes and uncompressed inodes
365 : * are written in the same order that the flusher thread sent them
366 : * down.
367 : */
368 422 : static noinline int compress_file_range(struct inode *inode,
369 81 : struct page *locked_page,
370 : u64 start, u64 end,
371 : struct async_cow *async_cow,
372 : int *num_added)
373 : {
374 211 : struct btrfs_root *root = BTRFS_I(inode)->root;
375 : u64 num_bytes;
376 211 : u64 blocksize = root->sectorsize;
377 : u64 actual_end;
378 211 : u64 isize = i_size_read(inode);
379 : int ret = 0;
380 : struct page **pages = NULL;
381 : unsigned long nr_pages;
382 211 : unsigned long nr_pages_ret = 0;
383 211 : unsigned long total_compressed = 0;
384 211 : unsigned long total_in = 0;
385 : unsigned long max_compressed = 128 * 1024;
386 : unsigned long max_uncompressed = 128 * 1024;
387 : int i;
388 : int will_compress;
389 211 : int compress_type = root->fs_info->compress_type;
390 : int redirty = 0;
391 :
392 : /* if this is a small write inside eof, kick off a defrag */
393 211 : if ((end - start + 1) < 16 * 1024 &&
394 0 : (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
395 90 : btrfs_add_inode_defrag(NULL, inode);
396 :
397 : /*
398 : * skip compression for a small file range (<= blocksize) that
399 : * isn't an inline extent, since it doesn't save disk space at all.
400 : */
401 211 : if ((end - start + 1) <= blocksize &&
402 0 : (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
403 : goto cleanup_and_bail_uncompressed;
404 :
405 133 : actual_end = min_t(u64, isize, end + 1);
406 : again:
407 : will_compress = 0;
408 156 : nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
409 156 : nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
410 :
411 : /*
412 : * we don't want to send crud past the end of i_size through
413 : * compression, that's just a waste of CPU time. So, if the
414 : * end of the file is before the start of our current
415 : * requested range of bytes, we bail out to the uncompressed
416 : * cleanup code that can deal with all of this.
417 : *
418 : * It isn't really the fastest way to fix things, but this is a
419 : * very uncommon corner.
420 : */
421 156 : if (actual_end <= start)
422 : goto cleanup_and_bail_uncompressed;
423 :
424 156 : total_compressed = actual_end - start;
425 :
426 : /* we want to make sure that the amount of ram required to uncompress
427 : * an extent is reasonable, so we limit the total size in ram
428 : * of a compressed extent to 128k. This is a crucial number
429 : * because it also controls how easily we can spread reads across
430 : * cpus for decompression.
431 : *
432 : * We also want to make sure the amount of IO required to do
433 : * a random read is reasonably small, so we limit the size of
434 : * a compressed extent to 128k.
435 : */
436 156 : total_compressed = min(total_compressed, max_uncompressed);
437 156 : num_bytes = ALIGN(end - start + 1, blocksize);
438 156 : num_bytes = max(blocksize, num_bytes);
439 156 : total_in = 0;
440 : ret = 0;
441 :
442 : /*
443 : * we do compression for mount -o compress and when the
444 : * inode has not been flagged as nocompress. This flag can
445 : * change at any time if we discover bad compression ratios.
446 : */
447 311 : if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
448 183 : (btrfs_test_opt(root, COMPRESS) ||
449 28 : (BTRFS_I(inode)->force_compress) ||
450 0 : (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
451 : WARN_ON(pages);
452 155 : pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
453 155 : if (!pages) {
454 : /* just bail out to the uncompressed code */
455 : goto cont;
456 : }
457 :
458 155 : if (BTRFS_I(inode)->force_compress)
459 28 : compress_type = BTRFS_I(inode)->force_compress;
460 :
461 : /*
462 : * we need to call clear_page_dirty_for_io on each
463 : * page in the range. Otherwise applications with the file
464 : * mmap'd can wander in and change the page contents while
465 : * we are compressing them.
466 : *
467 : * If the compression fails for any reason, we set the pages
468 : * dirty again later on.
469 : */
470 155 : extent_range_clear_dirty_for_io(inode, start, end);
471 : redirty = 1;
472 155 : ret = btrfs_compress_pages(compress_type,
473 : inode->i_mapping, start,
474 : total_compressed, pages,
475 : nr_pages, &nr_pages_ret,
476 : &total_in,
477 : &total_compressed,
478 : max_compressed);
479 :
480 155 : if (!ret) {
481 155 : unsigned long offset = total_compressed &
482 : (PAGE_CACHE_SIZE - 1);
483 155 : struct page *page = pages[nr_pages_ret - 1];
484 : char *kaddr;
485 :
486 : /* zero the tail end of the last page, we might be
487 : * sending it down to disk
488 : */
489 155 : if (offset) {
490 : kaddr = kmap_atomic(page);
491 155 : memset(kaddr + offset, 0,
492 : PAGE_CACHE_SIZE - offset);
493 : kunmap_atomic(kaddr);
494 : }
495 : will_compress = 1;
496 : }
497 : }
498 : cont:
499 156 : if (start == 0) {
500 : /* let's try to make an inline extent */
501 6 : if (ret || total_in < (actual_end - start)) {
502 : /* we didn't compress the entire range, try
503 : * to make an uncompressed inline extent.
504 : */
505 4 : ret = cow_file_range_inline(root, inode, start, end,
506 : 0, 0, NULL);
507 : } else {
508 : /* try making a compressed inline extent */
509 2 : ret = cow_file_range_inline(root, inode, start, end,
510 : total_compressed,
511 : compress_type, pages);
512 : }
513 6 : if (ret <= 0) {
514 : unsigned long clear_flags = EXTENT_DELALLOC |
515 : EXTENT_DEFRAG;
516 0 : clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
517 :
518 : /*
519 : * inline extent creation worked or returned an error,
520 : * we don't need to create any more async work items.
521 : * Unlock and free up our temp pages.
522 : */
523 0 : extent_clear_unlock_delalloc(inode, start, end, NULL,
524 : clear_flags, PAGE_UNLOCK |
525 : PAGE_CLEAR_DIRTY |
526 : PAGE_SET_WRITEBACK |
527 : PAGE_END_WRITEBACK);
528 : goto free_pages_out;
529 : }
530 : }
531 :
532 156 : if (will_compress) {
533 : /*
534 : * we aren't doing an inline extent, so round the compressed size
535 : * up to a block size boundary so that the allocator does sane
536 : * things
537 : */
538 155 : total_compressed = ALIGN(total_compressed, blocksize);
539 :
540 : /*
541 : * one last check to make sure the compression is really a
542 : * win, compare the page count read with the blocks on disk
543 : */
544 155 : total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
545 155 : if (total_compressed >= total_in) {
546 : will_compress = 0;
547 : } else {
548 : num_bytes = total_in;
549 : }
550 : }
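	/*
	 * Worked example of the check above, assuming 4K pages and blocks:
	 * a 128K range that compresses to 13500 bytes gives
	 * total_compressed = 16K after ALIGN(), total_in = 128K, and
	 * 16K < 128K keeps the extent compressed. If compression only
	 * reached 126K, ALIGN() rounds it back up to 128K, the comparison
	 * becomes 128K >= 128K, and will_compress is cleared.
	 */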
551 156 : if (!will_compress && pages) {
552 : /*
553 : * the compression code ran but failed to make things smaller,
554 : * free any pages it allocated and our page pointer array
555 : */
556 2 : for (i = 0; i < nr_pages_ret; i++) {
557 2 : WARN_ON(pages[i]->mapping);
558 2 : page_cache_release(pages[i]);
559 : }
560 2 : kfree(pages);
561 : pages = NULL;
562 2 : total_compressed = 0;
563 2 : nr_pages_ret = 0;
564 :
565 : /* flag the file so we don't compress in the future */
566 4 : if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
567 2 : !(BTRFS_I(inode)->force_compress)) {
568 2 : BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
569 : }
570 : }
571 156 : if (will_compress) {
572 153 : *num_added += 1;
573 :
574 : /* the async work queues will take care of doing actual
575 : * allocation on disk for these compressed pages,
576 : * and will submit them to the elevator.
577 : */
578 153 : add_async_extent(async_cow, start, num_bytes,
579 : total_compressed, pages, nr_pages_ret,
580 : compress_type);
581 :
582 153 : if (start + num_bytes < end) {
583 : start += num_bytes;
584 : pages = NULL;
585 23 : cond_resched();
586 23 : goto again;
587 : }
588 : } else {
589 : cleanup_and_bail_uncompressed:
590 : /*
591 : * No compression, but we still need to write the pages in
592 : * the file we've been given so far. Redirty the locked
593 : * page if it corresponds to our extent and set things up
594 : * for the async work queue to run cow_file_range to do
595 : * the normal delalloc dance
596 : */
597 81 : if (page_offset(locked_page) >= start &&
598 : page_offset(locked_page) <= end) {
599 79 : __set_page_dirty_nobuffers(locked_page);
600 : /* unlocked later on in the async handlers */
601 : }
602 81 : if (redirty)
603 2 : extent_range_redirty_for_io(inode, start, end);
604 81 : add_async_extent(async_cow, start, end - start + 1,
605 : 0, NULL, 0, BTRFS_COMPRESS_NONE);
606 81 : *num_added += 1;
607 : }
608 :
609 : out:
610 211 : return ret;
611 :
612 : free_pages_out:
613 0 : for (i = 0; i < nr_pages_ret; i++) {
614 0 : WARN_ON(pages[i]->mapping);
615 0 : page_cache_release(pages[i]);
616 : }
617 0 : kfree(pages);
618 :
619 0 : goto out;
620 : }
621 :
622 : /*
623 : * phase two of compressed writeback. This is the ordered portion
624 : * of the code, which only gets called in the order the work was
625 : * queued. We walk all the async extents created by compress_file_range
626 : * and send them down to the disk.
627 : */
628 211 : static noinline int submit_compressed_extents(struct inode *inode,
629 : struct async_cow *async_cow)
630 : {
631 : struct async_extent *async_extent;
632 : u64 alloc_hint = 0;
633 : struct btrfs_key ins;
634 : struct extent_map *em;
635 211 : struct btrfs_root *root = BTRFS_I(inode)->root;
636 211 : struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
637 : struct extent_io_tree *io_tree;
638 : int ret = 0;
639 :
640 422 : if (list_empty(&async_cow->extents))
641 : return 0;
642 :
643 : again:
644 445 : while (!list_empty(&async_cow->extents)) {
645 234 : async_extent = list_entry(async_cow->extents.next,
646 : struct async_extent, list);
647 234 : list_del(&async_extent->list);
648 :
649 234 : io_tree = &BTRFS_I(inode)->io_tree;
650 :
651 : retry:
652 : /* did the compression code fall back to uncompressed IO? */
653 234 : if (!async_extent->pages) {
654 81 : int page_started = 0;
655 81 : unsigned long nr_written = 0;
656 :
657 81 : lock_extent(io_tree, async_extent->start,
658 162 : async_extent->start +
659 81 : async_extent->ram_size - 1);
660 :
661 : /* allocate blocks */
662 81 : ret = cow_file_range(inode, async_cow->locked_page,
663 : async_extent->start,
664 162 : async_extent->start +
665 81 : async_extent->ram_size - 1,
666 : &page_started, &nr_written, 0);
667 :
668 : /* JDM XXX */
669 :
670 : /*
671 : * if page_started, cow_file_range inserted an
672 : * inline extent and took care of all the unlocking
673 : * and IO for us. Otherwise, we need to submit
674 : * all those pages down to the drive.
675 : */
676 81 : if (!page_started && !ret)
677 81 : extent_write_locked_range(io_tree,
678 : inode, async_extent->start,
679 162 : async_extent->start +
680 81 : async_extent->ram_size - 1,
681 : btrfs_get_extent,
682 : WB_SYNC_ALL);
683 0 : else if (ret)
684 0 : unlock_page(async_cow->locked_page);
685 81 : kfree(async_extent);
686 81 : cond_resched();
687 81 : continue;
688 : }
689 :
690 153 : lock_extent(io_tree, async_extent->start,
691 153 : async_extent->start + async_extent->ram_size - 1);
692 :
693 153 : ret = btrfs_reserve_extent(root,
694 : async_extent->compressed_size,
695 : async_extent->compressed_size,
696 : 0, alloc_hint, &ins, 1, 1);
697 153 : if (ret) {
698 : int i;
699 :
700 0 : for (i = 0; i < async_extent->nr_pages; i++) {
701 0 : WARN_ON(async_extent->pages[i]->mapping);
702 0 : page_cache_release(async_extent->pages[i]);
703 : }
704 0 : kfree(async_extent->pages);
705 0 : async_extent->nr_pages = 0;
706 0 : async_extent->pages = NULL;
707 :
708 0 : if (ret == -ENOSPC) {
709 0 : unlock_extent(io_tree, async_extent->start,
710 0 : async_extent->start +
711 0 : async_extent->ram_size - 1);
712 :
713 : /*
714 : * we need to redirty the pages if we decide to
715 : * fall back to uncompressed IO, otherwise we
716 : * will not submit these pages down to lower
717 : * layers.
718 : */
719 0 : extent_range_redirty_for_io(inode,
720 : async_extent->start,
721 0 : async_extent->start +
722 0 : async_extent->ram_size - 1);
723 :
724 0 : goto retry;
725 : }
726 : goto out_free;
727 : }
728 :
729 : /*
730 : * here we're doing allocation and writeback of the
731 : * compressed pages
732 : */
733 153 : btrfs_drop_extent_cache(inode, async_extent->start,
734 306 : async_extent->start +
735 153 : async_extent->ram_size - 1, 0);
736 :
737 153 : em = alloc_extent_map();
738 153 : if (!em) {
739 : ret = -ENOMEM;
740 : goto out_free_reserve;
741 : }
742 153 : em->start = async_extent->start;
743 153 : em->len = async_extent->ram_size;
744 153 : em->orig_start = em->start;
745 153 : em->mod_start = em->start;
746 153 : em->mod_len = em->len;
747 :
748 153 : em->block_start = ins.objectid;
749 153 : em->block_len = ins.offset;
750 153 : em->orig_block_len = ins.offset;
751 153 : em->ram_bytes = async_extent->ram_size;
752 153 : em->bdev = root->fs_info->fs_devices->latest_bdev;
753 153 : em->compress_type = async_extent->compress_type;
754 : set_bit(EXTENT_FLAG_PINNED, &em->flags);
755 : set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
756 153 : em->generation = -1;
757 :
758 : while (1) {
759 153 : write_lock(&em_tree->lock);
760 153 : ret = add_extent_mapping(em_tree, em, 1);
761 : write_unlock(&em_tree->lock);
762 153 : if (ret != -EEXIST) {
763 153 : free_extent_map(em);
764 : break;
765 : }
766 0 : btrfs_drop_extent_cache(inode, async_extent->start,
767 0 : async_extent->start +
768 0 : async_extent->ram_size - 1, 0);
769 0 : }
770 :
771 153 : if (ret)
772 : goto out_free_reserve;
773 :
774 153 : ret = btrfs_add_ordered_extent_compress(inode,
775 : async_extent->start,
776 : ins.objectid,
777 : async_extent->ram_size,
778 : ins.offset,
779 : BTRFS_ORDERED_COMPRESSED,
780 : async_extent->compress_type);
781 153 : if (ret) {
782 0 : btrfs_drop_extent_cache(inode, async_extent->start,
783 0 : async_extent->start +
784 0 : async_extent->ram_size - 1, 0);
785 0 : goto out_free_reserve;
786 : }
787 :
788 : /*
789 : * clear dirty, set writeback and unlock the pages.
790 : */
791 153 : extent_clear_unlock_delalloc(inode, async_extent->start,
792 306 : async_extent->start +
793 153 : async_extent->ram_size - 1,
794 : NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
795 : PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
796 : PAGE_SET_WRITEBACK);
797 459 : ret = btrfs_submit_compressed_write(inode,
798 : async_extent->start,
799 153 : async_extent->ram_size,
800 : ins.objectid,
801 153 : ins.offset, async_extent->pages,
802 : async_extent->nr_pages);
803 153 : alloc_hint = ins.objectid + ins.offset;
804 153 : kfree(async_extent);
805 153 : if (ret)
806 : goto out;
807 153 : cond_resched();
808 : }
809 : ret = 0;
810 : out:
811 211 : return ret;
812 : out_free_reserve:
813 0 : btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
814 : out_free:
815 0 : extent_clear_unlock_delalloc(inode, async_extent->start,
816 0 : async_extent->start +
817 0 : async_extent->ram_size - 1,
818 : NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
819 : EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
820 : PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
821 : PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
822 0 : kfree(async_extent);
823 0 : goto again;
824 : }
825 :
826 45642 : static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
827 : u64 num_bytes)
828 : {
829 45642 : struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
830 : struct extent_map *em;
831 : u64 alloc_hint = 0;
832 :
833 45642 : read_lock(&em_tree->lock);
834 45645 : em = search_extent_mapping(em_tree, start, num_bytes);
835 45641 : if (em) {
836 : /*
837 : * if block start isn't an actual block number then find the
838 : * first block in this inode and use that as a hint. If that
839 : * block is also bogus then just don't worry about it.
840 : */
841 38468 : if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
842 30273 : free_extent_map(em);
843 30270 : em = search_extent_mapping(em_tree, 0, 0);
844 30272 : if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
845 : alloc_hint = em->block_start;
846 30272 : if (em)
847 30272 : free_extent_map(em);
848 : } else {
849 : alloc_hint = em->block_start;
850 8195 : free_extent_map(em);
851 : }
852 : }
853 : read_unlock(&em_tree->lock);
854 :
855 45645 : return alloc_hint;
856 : }
857 :
858 : /*
859 : * when extent_io.c finds a delayed allocation range in the file,
860 : * the callbacks end up in this code. The basic idea is to
861 : * allocate extents on disk for the range, and create ordered data structs
862 : * in ram to track those extents.
863 : *
864 : * locked_page is the page that writepage had locked already. We use
865 : * it to make sure we don't do extra locks or unlocks.
866 : *
867 : * *page_started is set to one if we unlock locked_page and do everything
868 : * required to start IO on it. It may be clean and already done with
869 : * IO when we return.
870 : */
871 24236 : static noinline int cow_file_range(struct inode *inode,
872 : struct page *locked_page,
873 : u64 start, u64 end, int *page_started,
874 : unsigned long *nr_written,
875 : int unlock)
876 : {
877 24236 : struct btrfs_root *root = BTRFS_I(inode)->root;
878 : u64 alloc_hint = 0;
879 : u64 num_bytes;
880 : unsigned long ram_size;
881 : u64 disk_num_bytes;
882 : u64 cur_alloc_size;
883 24236 : u64 blocksize = root->sectorsize;
884 : struct btrfs_key ins;
885 : struct extent_map *em;
886 24236 : struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
887 : int ret = 0;
888 :
889 24236 : if (btrfs_is_free_space_inode(inode)) {
890 0 : WARN_ON_ONCE(1);
891 : ret = -EINVAL;
892 : goto out_unlock;
893 : }
894 :
895 24236 : num_bytes = ALIGN(end - start + 1, blocksize);
896 24236 : num_bytes = max(blocksize, num_bytes);
897 : disk_num_bytes = num_bytes;
898 :
899 : /* if this is a small write inside eof, kick off defrag */
900 24236 : if (num_bytes < 64 * 1024 &&
901 7774 : (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
902 11818 : btrfs_add_inode_defrag(NULL, inode);
903 :
904 24235 : if (start == 0) {
906 : /* let's try to make an inline extent */
906 11208 : ret = cow_file_range_inline(root, inode, start, end, 0, 0,
907 : NULL);
908 11208 : if (ret == 0) {
909 3851 : extent_clear_unlock_delalloc(inode, start, end, NULL,
910 : EXTENT_LOCKED | EXTENT_DELALLOC |
911 : EXTENT_DEFRAG, PAGE_UNLOCK |
912 : PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
913 : PAGE_END_WRITEBACK);
914 :
915 7702 : *nr_written = *nr_written +
916 3851 : (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
917 3851 : *page_started = 1;
918 3851 : goto out;
919 7357 : } else if (ret < 0) {
920 : goto out_unlock;
921 : }
922 : }
923 :
924 40768 : BUG_ON(disk_num_bytes >
925 : btrfs_super_total_bytes(root->fs_info->super_copy));
926 :
927 20384 : alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
928 20385 : btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
929 :
930 61153 : while (disk_num_bytes > 0) {
931 : unsigned long op;
932 :
933 : cur_alloc_size = disk_num_bytes;
934 20384 : ret = btrfs_reserve_extent(root, cur_alloc_size,
935 20384 : root->sectorsize, 0, alloc_hint,
936 : &ins, 1, 1);
937 20385 : if (ret < 0)
938 : goto out_unlock;
939 :
940 20385 : em = alloc_extent_map();
941 20384 : if (!em) {
942 : ret = -ENOMEM;
943 : goto out_reserve;
944 : }
945 20384 : em->start = start;
946 20384 : em->orig_start = em->start;
947 20384 : ram_size = ins.offset;
948 20384 : em->len = ins.offset;
949 20384 : em->mod_start = em->start;
950 20384 : em->mod_len = em->len;
951 :
952 20384 : em->block_start = ins.objectid;
953 20384 : em->block_len = ins.offset;
954 20384 : em->orig_block_len = ins.offset;
955 20384 : em->ram_bytes = ram_size;
956 20384 : em->bdev = root->fs_info->fs_devices->latest_bdev;
957 : set_bit(EXTENT_FLAG_PINNED, &em->flags);
958 20384 : em->generation = -1;
959 :
960 : while (1) {
961 20384 : write_lock(&em_tree->lock);
962 20385 : ret = add_extent_mapping(em_tree, em, 1);
963 : write_unlock(&em_tree->lock);
964 20382 : if (ret != -EEXIST) {
965 20382 : free_extent_map(em);
966 : break;
967 : }
968 0 : btrfs_drop_extent_cache(inode, start,
969 0 : start + ram_size - 1, 0);
970 0 : }
971 20385 : if (ret)
972 : goto out_reserve;
973 :
974 20385 : cur_alloc_size = ins.offset;
975 20385 : ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
976 : ram_size, cur_alloc_size, 0);
977 20385 : if (ret)
978 : goto out_drop_extent_cache;
979 :
980 20385 : if (root->root_key.objectid ==
981 : BTRFS_DATA_RELOC_TREE_OBJECTID) {
982 0 : ret = btrfs_reloc_clone_csums(inode, start,
983 : cur_alloc_size);
984 0 : if (ret)
985 : goto out_drop_extent_cache;
986 : }
987 :
988 20385 : if (disk_num_bytes < cur_alloc_size)
989 : break;
990 :
991 : /* we're not doing compressed IO, don't unlock the first
992 : * page (which the caller expects to stay locked), don't
993 : * clear any dirty bits and don't set any writeback bits
994 : *
995 : * Do set the Private2 bit so we know this page was properly
996 : * setup for writepage
997 : */
998 20385 : op = unlock ? PAGE_UNLOCK : 0;
999 20385 : op |= PAGE_SET_PRIVATE2;
1000 :
1001 20385 : extent_clear_unlock_delalloc(inode, start,
1002 20385 : start + ram_size - 1, locked_page,
1003 : EXTENT_LOCKED | EXTENT_DELALLOC,
1004 : op);
1005 20385 : disk_num_bytes -= cur_alloc_size;
1006 : num_bytes -= cur_alloc_size;
1007 20385 : alloc_hint = ins.objectid + ins.offset;
1008 20385 : start += cur_alloc_size;
1009 : }
1010 : out:
1011 24236 : return ret;
1012 :
1013 : out_drop_extent_cache:
1014 0 : btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1015 : out_reserve:
1016 0 : btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
1017 : out_unlock:
1018 0 : extent_clear_unlock_delalloc(inode, start, end, locked_page,
1019 : EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
1020 : EXTENT_DELALLOC | EXTENT_DEFRAG,
1021 : PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
1022 : PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
1023 0 : goto out;
1024 : }
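/*
 * A worked example of the allocation loop above (one possible run, the
 * numbers are made up): for a 1M delalloc range, btrfs_reserve_extent()
 * may only find a contiguous 384K extent on the first pass. That chunk
 * gets an extent map and an ordered extent, its pages are unlocked for
 * writeback, and the loop retries with start advanced by 384K and
 * disk_num_bytes reduced accordingly, until the whole range is covered.
 */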
1025 :
1026 : /*
1027 : * work queue call back to started compression on a file and pages
1028 : */
1029 211 : static noinline void async_cow_start(struct btrfs_work *work)
1030 : {
1031 : struct async_cow *async_cow;
1032 211 : int num_added = 0;
1033 211 : async_cow = container_of(work, struct async_cow, work);
1034 :
1035 211 : compress_file_range(async_cow->inode, async_cow->locked_page,
1036 : async_cow->start, async_cow->end, async_cow,
1037 : &num_added);
1038 211 : if (num_added == 0) {
1039 0 : btrfs_add_delayed_iput(async_cow->inode);
1040 0 : async_cow->inode = NULL;
1041 : }
1042 211 : }
1043 :
1044 : /*
1045 : * work queue callback to submit previously compressed pages
1046 : */
1047 211 : static noinline void async_cow_submit(struct btrfs_work *work)
1048 : {
1049 : struct async_cow *async_cow;
1050 : struct btrfs_root *root;
1051 : unsigned long nr_pages;
1052 :
1053 211 : async_cow = container_of(work, struct async_cow, work);
1054 :
1055 211 : root = async_cow->root;
1056 211 : nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
1057 : PAGE_CACHE_SHIFT;
1058 :
1059 422 : if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
1060 211 : 5 * 1024 * 1024 &&
1061 211 : waitqueue_active(&root->fs_info->async_submit_wait))
1062 0 : wake_up(&root->fs_info->async_submit_wait);
1063 :
1064 211 : if (async_cow->inode)
1065 211 : submit_compressed_extents(async_cow->inode, async_cow);
1066 211 : }
1067 :
1068 211 : static noinline void async_cow_free(struct btrfs_work *work)
1069 : {
1070 : struct async_cow *async_cow;
1071 211 : async_cow = container_of(work, struct async_cow, work);
1072 211 : if (async_cow->inode)
1073 211 : btrfs_add_delayed_iput(async_cow->inode);
1074 211 : kfree(async_cow);
1075 211 : }
1076 :
1077 208 : static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1078 : u64 start, u64 end, int *page_started,
1079 : unsigned long *nr_written)
1080 : {
1081 : struct async_cow *async_cow;
1082 208 : struct btrfs_root *root = BTRFS_I(inode)->root;
1083 : unsigned long nr_pages;
1084 : u64 cur_end;
1085 : int limit = 10 * 1024 * 1024;
1086 :
1087 208 : clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1088 : 1, 0, NULL, GFP_NOFS);
1089 627 : while (start < end) {
1090 : async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1091 211 : BUG_ON(!async_cow); /* -ENOMEM */
1092 211 : async_cow->inode = igrab(inode);
1093 211 : async_cow->root = root;
1094 211 : async_cow->locked_page = locked_page;
1095 211 : async_cow->start = start;
1096 :
1097 211 : if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
1098 : cur_end = end;
1099 : else
1100 208 : cur_end = min(end, start + 512 * 1024 - 1);
1101 :
1102 211 : async_cow->end = cur_end;
1103 211 : INIT_LIST_HEAD(&async_cow->extents);
1104 :
1105 211 : btrfs_init_work(&async_cow->work,
1106 : btrfs_delalloc_helper,
1107 : async_cow_start, async_cow_submit,
1108 : async_cow_free);
1109 :
1110 211 : nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1111 : PAGE_CACHE_SHIFT;
1112 211 : atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1113 :
1114 211 : btrfs_queue_work(root->fs_info->delalloc_workers,
1115 : &async_cow->work);
1116 :
1117 422 : if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1118 0 : wait_event(root->fs_info->async_submit_wait,
1119 : (atomic_read(&root->fs_info->async_delalloc_pages) <
1120 : limit));
1121 : }
1122 :
1123 422 : while (atomic_read(&root->fs_info->async_submit_draining) &&
1124 : atomic_read(&root->fs_info->async_delalloc_pages)) {
1125 0 : wait_event(root->fs_info->async_submit_wait,
1126 : (atomic_read(&root->fs_info->async_delalloc_pages) ==
1127 : 0));
1128 : }
1129 :
1130 211 : *nr_written += nr_pages;
1131 211 : start = cur_end + 1;
1132 : }
1133 208 : *page_started = 1;
1134 208 : return 0;
1135 : }
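/*
 * Example of the chunking above: a 1152K dirty range on a compressible
 * inode becomes three async_cow units of 512K, 512K and 128K, each queued
 * to the delalloc workers; *nr_written is bumped for all of those pages
 * even though the actual IO completes asynchronously later.
 */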
1136 :
1137 5707 : static noinline int csum_exist_in_range(struct btrfs_root *root,
1138 : u64 bytenr, u64 num_bytes)
1139 : {
1140 : int ret;
1141 : struct btrfs_ordered_sum *sums;
1142 5707 : LIST_HEAD(list);
1143 :
1144 5707 : ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
1145 5707 : bytenr + num_bytes - 1, &list, 0);
1146 11414 : if (ret == 0 && list_empty(&list))
1147 : return 0;
1148 :
1149 0 : while (!list_empty(&list)) {
1150 0 : sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1151 0 : list_del(&sums->list);
1152 0 : kfree(sums);
1153 : }
1154 : return 1;
1155 : }
1156 :
1157 : /*
1158 : * when the nocow writeback callback runs. This checks for snapshots or COW copies
1159 : * of the extents that exist in the file, and COWs the file as required.
1160 : *
1161 : * If no cow copies or snapshots exist, we write directly to the existing
1162 : * blocks on disk
1163 : */
1164 13707 : static noinline int run_delalloc_nocow(struct inode *inode,
1165 : struct page *locked_page,
1166 : u64 start, u64 end, int *page_started, int force,
1167 : unsigned long *nr_written)
1168 : {
1169 19415 : struct btrfs_root *root = BTRFS_I(inode)->root;
1170 : struct btrfs_trans_handle *trans;
1171 14214 : struct extent_buffer *leaf;
1172 : struct btrfs_path *path;
1173 : struct btrfs_file_extent_item *fi;
1174 : struct btrfs_key found_key;
1175 : u64 cow_start;
1176 : u64 cur_offset;
1177 : u64 extent_end;
1178 : u64 extent_offset;
1179 : u64 disk_bytenr;
1180 : u64 num_bytes;
1181 : u64 disk_num_bytes;
1182 : u64 ram_bytes;
1183 : int extent_type;
1184 : int ret, err;
1185 : int type;
1186 : int nocow;
1187 : int check_prev = 1;
1188 : bool nolock;
1189 : u64 ino = btrfs_ino(inode);
1190 :
1191 13707 : path = btrfs_alloc_path();
1192 13707 : if (!path) {
1193 0 : extent_clear_unlock_delalloc(inode, start, end, locked_page,
1194 : EXTENT_LOCKED | EXTENT_DELALLOC |
1195 : EXTENT_DO_ACCOUNTING |
1196 : EXTENT_DEFRAG, PAGE_UNLOCK |
1197 : PAGE_CLEAR_DIRTY |
1198 : PAGE_SET_WRITEBACK |
1199 : PAGE_END_WRITEBACK);
1200 0 : return -ENOMEM;
1201 : }
1202 :
1203 13707 : nolock = btrfs_is_free_space_inode(inode);
1204 :
1205 13707 : if (nolock)
1206 4057 : trans = btrfs_join_transaction_nolock(root);
1207 : else
1208 9650 : trans = btrfs_join_transaction(root);
1209 :
1210 13707 : if (IS_ERR(trans)) {
1211 0 : extent_clear_unlock_delalloc(inode, start, end, locked_page,
1212 : EXTENT_LOCKED | EXTENT_DELALLOC |
1213 : EXTENT_DO_ACCOUNTING |
1214 : EXTENT_DEFRAG, PAGE_UNLOCK |
1215 : PAGE_CLEAR_DIRTY |
1216 : PAGE_SET_WRITEBACK |
1217 : PAGE_END_WRITEBACK);
1218 0 : btrfs_free_path(path);
1219 0 : return PTR_ERR(trans);
1220 : }
1221 :
1222 13707 : trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1223 :
1224 : cow_start = (u64)-1;
1225 : cur_offset = start;
1226 : while (1) {
1227 13749 : ret = btrfs_lookup_file_extent(trans, root, path, ino,
1228 : cur_offset, 0);
1229 13749 : if (ret < 0)
1230 : goto error;
1231 13749 : if (ret > 0 && path->slots[0] > 0 && check_prev) {
1232 2639 : leaf = path->nodes[0];
1233 2639 : btrfs_item_key_to_cpu(leaf, &found_key,
1234 : path->slots[0] - 1);
1235 5278 : if (found_key.objectid == ino &&
1236 2639 : found_key.type == BTRFS_EXTENT_DATA_KEY)
1237 2639 : path->slots[0]--;
1238 : }
1239 : check_prev = 0;
1240 : next_slot:
1241 14214 : leaf = path->nodes[0];
1242 28428 : if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1243 4 : ret = btrfs_next_leaf(root, path);
1244 4 : if (ret < 0)
1245 : goto error;
1246 4 : if (ret > 0)
1247 : break;
1248 4 : leaf = path->nodes[0];
1249 : }
1250 :
1251 : nocow = 0;
1252 : disk_bytenr = 0;
1253 : num_bytes = 0;
1254 14214 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1255 :
1256 28414 : if (found_key.objectid > ino ||
1257 28400 : found_key.type > BTRFS_EXTENT_DATA_KEY ||
1258 14200 : found_key.offset > end)
1259 : break;
1260 :
1261 14199 : if (found_key.offset > cur_offset) {
1262 : extent_end = found_key.offset;
1263 : extent_type = 0;
1264 : goto out_check;
1265 : }
1266 :
1267 28398 : fi = btrfs_item_ptr(leaf, path->slots[0],
1268 : struct btrfs_file_extent_item);
1269 14199 : extent_type = btrfs_file_extent_type(leaf, fi);
1270 :
1271 : ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1272 14199 : if (extent_type == BTRFS_FILE_EXTENT_REG ||
1273 : extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1274 : disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1275 : extent_offset = btrfs_file_extent_offset(leaf, fi);
1276 28397 : extent_end = found_key.offset +
1277 : btrfs_file_extent_num_bytes(leaf, fi);
1278 : disk_num_bytes =
1279 : btrfs_file_extent_disk_num_bytes(leaf, fi);
1280 14199 : if (extent_end <= start) {
1281 0 : path->slots[0]++;
1282 0 : goto next_slot;
1283 : }
1284 14199 : if (disk_bytenr == 0)
1285 : goto out_check;
1286 26239 : if (btrfs_file_extent_compression(leaf, fi) ||
1287 13114 : btrfs_file_extent_encryption(leaf, fi) ||
1288 : btrfs_file_extent_other_encoding(leaf, fi))
1289 : goto out_check;
1290 13114 : if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1291 : goto out_check;
1292 6754 : if (btrfs_extent_readonly(root, disk_bytenr))
1293 : goto out_check;
1294 6754 : if (btrfs_cross_ref_exist(trans, root, ino,
1295 6754 : found_key.offset -
1296 : extent_offset, disk_bytenr))
1297 : goto out_check;
1298 6654 : disk_bytenr += extent_offset;
1299 6654 : disk_bytenr += cur_offset - found_key.offset;
1300 6654 : num_bytes = min(end + 1, extent_end) - cur_offset;
1301 : /*
1302 : * if there are pending snapshots for this root,
1303 : * we fall back to the common COW path.
1304 : */
1305 6654 : if (!nolock) {
1306 2597 : err = btrfs_start_nocow_write(root);
1307 2597 : if (!err)
1308 : goto out_check;
1309 : }
1310 : /*
1311 : * force COW if a csum exists in the range.
1312 : * this ensures that csums for a given extent are
1313 : * either valid or do not exist.
1314 : */
1315 5708 : if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1316 : goto out_check;
1317 : nocow = 1;
1318 0 : } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1319 0 : extent_end = found_key.offset +
1320 0 : btrfs_file_extent_inline_len(leaf,
1321 : path->slots[0], fi);
1322 0 : extent_end = ALIGN(extent_end, root->sectorsize);
1323 : } else {
1324 0 : BUG_ON(1);
1325 : }
1326 : out_check:
1327 14198 : if (extent_end <= start) {
1328 0 : path->slots[0]++;
1329 0 : if (!nolock && nocow)
1330 0 : btrfs_end_nocow_write(root);
1331 : goto next_slot;
1332 : }
1333 14198 : if (!nocow) {
1334 8491 : if (cow_start == (u64)-1)
1335 : cow_start = cur_offset;
1336 : cur_offset = extent_end;
1337 8491 : if (cur_offset > end)
1338 : break;
1339 465 : path->slots[0]++;
1340 465 : goto next_slot;
1341 : }
1342 :
1343 5707 : btrfs_release_path(path);
1344 5707 : if (cow_start != (u64)-1) {
1345 147 : ret = cow_file_range(inode, locked_page,
1346 147 : cow_start, found_key.offset - 1,
1347 : page_started, nr_written, 1);
1348 147 : if (ret) {
1349 0 : if (!nolock && nocow)
1350 0 : btrfs_end_nocow_write(root);
1351 : goto error;
1352 : }
1353 : cow_start = (u64)-1;
1354 : }
1355 :
1356 5707 : if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1357 : struct extent_map *em;
1358 : struct extent_map_tree *em_tree;
1359 5625 : em_tree = &BTRFS_I(inode)->extent_tree;
1360 5625 : em = alloc_extent_map();
1361 5625 : BUG_ON(!em); /* -ENOMEM */
1362 5625 : em->start = cur_offset;
1363 5625 : em->orig_start = found_key.offset - extent_offset;
1364 5625 : em->len = num_bytes;
1365 5625 : em->block_len = num_bytes;
1366 5625 : em->block_start = disk_bytenr;
1367 5625 : em->orig_block_len = disk_num_bytes;
1368 5625 : em->ram_bytes = ram_bytes;
1369 5625 : em->bdev = root->fs_info->fs_devices->latest_bdev;
1370 5625 : em->mod_start = em->start;
1371 5625 : em->mod_len = em->len;
1372 : set_bit(EXTENT_FLAG_PINNED, &em->flags);
1373 : set_bit(EXTENT_FLAG_FILLING, &em->flags);
1374 5625 : em->generation = -1;
1375 : while (1) {
1376 11250 : write_lock(&em_tree->lock);
1377 11250 : ret = add_extent_mapping(em_tree, em, 1);
1378 : write_unlock(&em_tree->lock);
1379 11250 : if (ret != -EEXIST) {
1380 5625 : free_extent_map(em);
1381 : break;
1382 : }
1383 5625 : btrfs_drop_extent_cache(inode, em->start,
1384 5625 : em->start + em->len - 1, 0);
1385 5625 : }
1386 : type = BTRFS_ORDERED_PREALLOC;
1387 : } else {
1388 : type = BTRFS_ORDERED_NOCOW;
1389 : }
1390 :
1391 5707 : ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1392 : num_bytes, num_bytes, type);
1393 5707 : BUG_ON(ret); /* -ENOMEM */
1394 :
1395 5707 : if (root->root_key.objectid ==
1396 : BTRFS_DATA_RELOC_TREE_OBJECTID) {
1397 1021 : ret = btrfs_reloc_clone_csums(inode, cur_offset,
1398 : num_bytes);
1399 1021 : if (ret) {
1400 0 : if (!nolock && nocow)
1401 0 : btrfs_end_nocow_write(root);
1402 : goto error;
1403 : }
1404 : }
1405 :
1406 5707 : extent_clear_unlock_delalloc(inode, cur_offset,
1407 5707 : cur_offset + num_bytes - 1,
1408 : locked_page, EXTENT_LOCKED |
1409 : EXTENT_DELALLOC, PAGE_UNLOCK |
1410 : PAGE_SET_PRIVATE2);
1411 5707 : if (!nolock && nocow)
1412 1650 : btrfs_end_nocow_write(root);
1413 : cur_offset = extent_end;
1414 5707 : if (cur_offset > end)
1415 : break;
1416 : }
1417 13706 : btrfs_release_path(path);
1418 :
1419 13707 : if (cur_offset <= end && cow_start == (u64)-1) {
1420 : cow_start = cur_offset;
1421 : cur_offset = end;
1422 : }
1423 :
1424 13707 : if (cow_start != (u64)-1) {
1425 8042 : ret = cow_file_range(inode, locked_page, cow_start, end,
1426 : page_started, nr_written, 1);
1427 : if (ret)
1428 : goto error;
1429 : }
1430 :
1431 : error:
1432 13707 : err = btrfs_end_transaction(trans, root);
1433 13707 : if (!ret)
1434 : ret = err;
1435 :
1436 13707 : if (ret && cur_offset < end)
1437 0 : extent_clear_unlock_delalloc(inode, cur_offset, end,
1438 : locked_page, EXTENT_LOCKED |
1439 : EXTENT_DELALLOC | EXTENT_DEFRAG |
1440 : EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1441 : PAGE_CLEAR_DIRTY |
1442 : PAGE_SET_WRITEBACK |
1443 : PAGE_END_WRITEBACK);
1444 13707 : btrfs_free_path(path);
1445 13707 : return ret;
1446 : }
1447 :
1448 : /*
1449 : * extent_io.c callback to do delayed allocation processing
1450 : */
1451 29881 : static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1452 : u64 start, u64 end, int *page_started,
1453 : unsigned long *nr_written)
1454 : {
1455 : int ret;
1456 29881 : struct btrfs_root *root = BTRFS_I(inode)->root;
1457 :
1458 29881 : if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
1459 4057 : ret = run_delalloc_nocow(inode, locked_page, start, end,
1460 : page_started, 1, nr_written);
1461 25824 : } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
1462 9650 : ret = run_delalloc_nocow(inode, locked_page, start, end,
1463 : page_started, 0, nr_written);
1464 32144 : } else if (!btrfs_test_opt(root, COMPRESS) &&
1465 31936 : !(BTRFS_I(inode)->force_compress) &&
1466 15966 : !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
1467 15966 : ret = cow_file_range(inode, locked_page, start, end,
1468 : page_started, nr_written, 1);
1469 : } else {
1470 : set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1471 : &BTRFS_I(inode)->runtime_flags);
1472 208 : ret = cow_file_range_async(inode, locked_page, start, end,
1473 : page_started, nr_written);
1474 : }
1475 29881 : return ret;
1476 : }
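/*
 * Informal summary of the dispatch above:
 *
 *	NODATACOW inode           -> run_delalloc_nocow(), force = 1
 *	PREALLOC inode            -> run_delalloc_nocow(), force = 0
 *	no compression requested  -> cow_file_range()
 *	compression requested     -> cow_file_range_async()
 */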
1477 :
1478 597391 : static void btrfs_split_extent_hook(struct inode *inode,
1479 : struct extent_state *orig, u64 split)
1480 : {
1481 : /* not delalloc, ignore it */
1482 597391 : if (!(orig->state & EXTENT_DELALLOC))
1483 597391 : return;
1484 :
1485 : spin_lock(&BTRFS_I(inode)->lock);
1486 27418 : BTRFS_I(inode)->outstanding_extents++;
1487 : spin_unlock(&BTRFS_I(inode)->lock);
1488 : }
1489 :
1490 : /*
1491 : * extent_io.c merge_extent_hook, used to track merged delayed allocation
1492 : * extents so we can keep track of new extents that are just merged onto old
1493 : * extents, such as when we are doing sequential writes, so we can properly
1494 : * account for the metadata space we'll need.
1495 : */
1496 555603 : static void btrfs_merge_extent_hook(struct inode *inode,
1497 : struct extent_state *new,
1498 : struct extent_state *other)
1499 : {
1500 : /* not delalloc, ignore it */
1501 555603 : if (!(other->state & EXTENT_DELALLOC))
1502 555603 : return;
1503 :
1504 : spin_lock(&BTRFS_I(inode)->lock);
1505 116028 : BTRFS_I(inode)->outstanding_extents--;
1506 : spin_unlock(&BTRFS_I(inode)->lock);
1507 : }
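/*
 * Example of the split/merge bookkeeping: a contiguous delalloc range is
 * counted as one outstanding extent for metadata reservation. When its
 * extent state is split (say, while operating on a sub-range) there are
 * now two candidate extents, so the count goes up; when two adjacent
 * delalloc states merge back together, the count comes back down.
 */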
1508 :
1509 48376 : static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1510 : struct inode *inode)
1511 : {
1512 : spin_lock(&root->delalloc_lock);
1513 96758 : if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1514 48379 : list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1515 : &root->delalloc_inodes);
1516 : set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1517 : &BTRFS_I(inode)->runtime_flags);
1518 48377 : root->nr_delalloc_inodes++;
1519 48377 : if (root->nr_delalloc_inodes == 1) {
1520 32365 : spin_lock(&root->fs_info->delalloc_root_lock);
1521 64734 : BUG_ON(!list_empty(&root->delalloc_root));
1522 32367 : list_add_tail(&root->delalloc_root,
1523 32367 : &root->fs_info->delalloc_roots);
1524 32367 : spin_unlock(&root->fs_info->delalloc_root_lock);
1525 : }
1526 : }
1527 : spin_unlock(&root->delalloc_lock);
1528 48378 : }
1529 :
1530 48378 : static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1531 : struct inode *inode)
1532 : {
1533 : spin_lock(&root->delalloc_lock);
1534 96758 : if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1535 : list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1536 : clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1537 : &BTRFS_I(inode)->runtime_flags);
1538 48377 : root->nr_delalloc_inodes--;
1539 48377 : if (!root->nr_delalloc_inodes) {
1540 32366 : spin_lock(&root->fs_info->delalloc_root_lock);
1541 64734 : BUG_ON(list_empty(&root->delalloc_root));
1542 : list_del_init(&root->delalloc_root);
1543 32367 : spin_unlock(&root->fs_info->delalloc_root_lock);
1544 : }
1545 : }
1546 : spin_unlock(&root->delalloc_lock);
1547 48378 : }
1548 :
1549 : /*
1550 : * extent_io.c set_bit_hook, used to track delayed allocation
1551 : * bytes in this file, and to maintain the list of inodes that
1552 : * have pending delalloc work to be done.
1553 : */
1554 1083201 : static void btrfs_set_bit_hook(struct inode *inode,
1555 : struct extent_state *state, unsigned long *bits)
1556 : {
1557 :
1558 : /*
1559 : * set_bit and clear_bit hooks normally require _irqsave/restore
1560 : * but in this case, we are only testing for the DELALLOC
1561 : * bit, which is only set or cleared with irqs on
1562 : */
1563 1083201 : if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1564 178033 : struct btrfs_root *root = BTRFS_I(inode)->root;
1565 178033 : u64 len = state->end + 1 - state->start;
1566 178033 : bool do_list = !btrfs_is_free_space_inode(inode);
1567 :
1568 178032 : if (*bits & EXTENT_FIRST_DELALLOC) {
1569 158382 : *bits &= ~EXTENT_FIRST_DELALLOC;
1570 : } else {
1571 : spin_lock(&BTRFS_I(inode)->lock);
1572 19650 : BTRFS_I(inode)->outstanding_extents++;
1573 : spin_unlock(&BTRFS_I(inode)->lock);
1574 : }
1575 :
1576 178032 : __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1577 178032 : root->fs_info->delalloc_batch);
1578 : spin_lock(&BTRFS_I(inode)->lock);
1579 178034 : BTRFS_I(inode)->delalloc_bytes += len;
1580 352013 : if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1581 : &BTRFS_I(inode)->runtime_flags))
1582 48376 : btrfs_add_delalloc_inodes(root, inode);
1583 : spin_unlock(&BTRFS_I(inode)->lock);
1584 : }
1585 1083205 : }
1586 :
1587 : /*
1588 : * extent_io.c clear_bit_hook, see set_bit_hook for why
1589 : */
1590 858660 : static void btrfs_clear_bit_hook(struct inode *inode,
1591 : struct extent_state *state,
1592 : unsigned long *bits)
1593 : {
1594 : /*
1595 : * set_bit and clear_bit hooks normally require _irqsave/restore
1596 : * but in this case, we are only testing for the DELALLOC
1597 : * bit, which is only set or cleared with irqs on
1598 : */
1599 858660 : if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1600 89406 : struct btrfs_root *root = BTRFS_I(inode)->root;
1601 89406 : u64 len = state->end + 1 - state->start;
1602 89406 : bool do_list = !btrfs_is_free_space_inode(inode);
1603 :
1604 89409 : if (*bits & EXTENT_FIRST_DELALLOC) {
1605 89408 : *bits &= ~EXTENT_FIRST_DELALLOC;
1606 1 : } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1607 : spin_lock(&BTRFS_I(inode)->lock);
1608 1 : BTRFS_I(inode)->outstanding_extents--;
1609 : spin_unlock(&BTRFS_I(inode)->lock);
1610 : }
1611 :
1612 : /*
1613 : * We don't reserve metadata space for space cache inodes so we
1614 : * don't need to call btrfs_delalloc_release_metadata if there is an
1615 : * error.
1616 : */
1617 123471 : if (*bits & EXTENT_DO_ACCOUNTING &&
1618 34063 : root != root->fs_info->tree_root)
1619 34063 : btrfs_delalloc_release_metadata(inode, len);
1620 :
1621 89408 : if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1622 88386 : && do_list && !(state->state & EXTENT_NORESERVE))
1623 84330 : btrfs_free_reserved_data_space(inode, len);
1624 :
1625 89419 : __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1626 89419 : root->fs_info->delalloc_batch);
1627 : spin_lock(&BTRFS_I(inode)->lock);
1628 89420 : BTRFS_I(inode)->delalloc_bytes -= len;
1629 137799 : if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1630 : test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1631 : &BTRFS_I(inode)->runtime_flags))
1632 48378 : btrfs_del_delalloc_inode(root, inode);
1633 : spin_unlock(&BTRFS_I(inode)->lock);
1634 : }
1635 858673 : }
1636 :
1637 : /*
1638 : * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1639 : * we don't create bios that span stripes or chunks
1640 : */
1641 1495689 : int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1642 : size_t size, struct bio *bio,
1643 : unsigned long bio_flags)
1644 : {
1645 1495689 : struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1646 1495689 : u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1647 : u64 length = 0;
1648 : u64 map_length;
1649 : int ret;
1650 :
1651 1495689 : if (bio_flags & EXTENT_BIO_COMPRESSED)
1652 : return 0;
1653 :
1654 1495399 : length = bio->bi_iter.bi_size;
1655 1495399 : map_length = length;
1656 1495399 : ret = btrfs_map_block(root->fs_info, rw, logical,
1657 : &map_length, NULL, 0);
1658 : /* Will always return 0 with map_multi == NULL */
1659 1495418 : BUG_ON(ret < 0);
1660 1495418 : if (map_length < length + size)
1661 : return 1;
1662 1447382 : return 0;
1663 : }
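: /*
:  * Worked example of the check above (illustrative numbers): a bio
:  * starting at logical 0 with bi_size 64K, where btrfs_map_block()
:  * reports map_length == 64K of contiguous mapping on one stripe,
:  * cannot take another 4K page because length + size == 68K would
:  * exceed map_length; the hook returns 1 and the caller starts a new
:  * bio rather than letting this one span stripes.
:  */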
1664 :
1665 : /*
1666 : * in order to insert checksums into the metadata in large chunks,
1667 : * we wait until bio submission time. All the pages in the bio are
1668 : * checksummed and sums are attached onto the ordered extent record.
1669 : *
1670 : * At IO completion time the csums attached to the ordered extent record
1671 : * are inserted into the btree
1672 : */
1673 59544 : static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1674 : struct bio *bio, int mirror_num,
1675 : unsigned long bio_flags,
1676 : u64 bio_offset)
1677 : {
1678 59544 : struct btrfs_root *root = BTRFS_I(inode)->root;
1679 : int ret = 0;
1680 :
1681 59544 : ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1682 59530 : BUG_ON(ret); /* -ENOMEM */
1683 59530 : return 0;
1684 : }
1685 :
1686 : /*
1687 : * the second phase of the async checksumming above: by the time this
1688 : * runs, the csums have already been computed and attached to the
1689 : * ordered extent record by __btrfs_submit_bio_start.
1690 : *
1691 : * All that is left here is to map the bio to the right device and
1692 : * submit it
1693 : */
1694 59547 : static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1695 : int mirror_num, unsigned long bio_flags,
1696 : u64 bio_offset)
1697 : {
1698 59547 : struct btrfs_root *root = BTRFS_I(inode)->root;
1699 : int ret;
1700 :
1701 59547 : ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1702 59547 : if (ret)
1703 0 : bio_endio(bio, ret);
1704 59547 : return ret;
1705 : }
1706 :
1707 : /*
1708 : * extent_io.c submission hook. This does the right thing for csum calculation
1709 : * on write, or reading the csums from the tree before a read
1710 : */
1711 92080 : static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1712 : int mirror_num, unsigned long bio_flags,
1713 : u64 bio_offset)
1714 : {
1715 92080 : struct btrfs_root *root = BTRFS_I(inode)->root;
1716 : int ret = 0;
1717 : int skip_sum;
1718 : int metadata = 0;
1719 92080 : int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1720 :
1721 92080 : skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1722 :
1723 92080 : if (btrfs_is_free_space_inode(inode))
1724 : metadata = 2;
1725 :
1726 92080 : if (!(rw & REQ_WRITE)) {
1727 26396 : ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1728 26396 : if (ret)
1729 : goto out;
1730 :
1731 26396 : if (bio_flags & EXTENT_BIO_COMPRESSED) {
1732 42 : ret = btrfs_submit_compressed_read(inode, bio,
1733 : mirror_num,
1734 : bio_flags);
1735 42 : goto out;
1736 26354 : } else if (!skip_sum) {
1737 26053 : ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1738 26053 : if (ret)
1739 : goto out;
1740 : }
1741 : goto mapit;
1742 65684 : } else if (async && !skip_sum) {
1743 : /* csum items have already been cloned */
1744 59559 : if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1745 : goto mapit;
1746 : /* we're doing a write, do the async checksumming */
1747 59547 : ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1748 : inode, rw, bio, mirror_num,
1749 : bio_flags, bio_offset,
1750 : __btrfs_submit_bio_start,
1751 : __btrfs_submit_bio_done);
1752 59547 : goto out;
1753 6125 : } else if (!skip_sum) {
1754 1609 : ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1755 1609 : if (ret)
1756 : goto out;
1757 : }
1758 :
1759 : mapit:
1760 32491 : ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1761 :
1762 : out:
1763 92080 : if (ret < 0)
1764 0 : bio_endio(bio, ret);
1765 92080 : return ret;
1766 : }
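: /*
:  * A minimal sketch of the two-phase async pattern used above
:  * (hypothetical toy_* names): btrfs_wq_submit_bio() hands the bio to a
:  * helper thread, which runs the "start" callback (checksumming, which
:  * may block) and then the "done" callback (the actual submission), so
:  * the writeback context never waits on checksum computation itself.
:  */
: #if 0
: struct toy_async_bio {
: 	struct bio *bio;
: 	int (*start)(struct bio *bio);	/* e.g. csum all the pages */
: 	int (*done)(struct bio *bio);	/* e.g. map and submit     */
: };
:
: static void toy_async_worker(struct toy_async_bio *w)
: {
: 	if (!w->start(w->bio))
: 		w->done(w->bio);
: }
: #endif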
1767 :
1768 : /*
1769 : * given a list of ordered sums, record them in the inode. This happens
1770 : * at IO completion time based on sums calculated at bio submission time.
1771 : */
1772 51414 : static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1773 : struct inode *inode, u64 file_offset,
1774 : struct list_head *list)
1775 : {
1776 : struct btrfs_ordered_sum *sum;
1777 :
1778 142297 : list_for_each_entry(sum, list, list) {
1779 90874 : trans->adding_csums = 1;
1780 90874 : btrfs_csum_file_blocks(trans,
1781 90874 : BTRFS_I(inode)->root->fs_info->csum_root, sum);
1782 90883 : trans->adding_csums = 0;
1783 : }
1784 51423 : return 0;
1785 : }
1786 :
1787 131791 : int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1788 : struct extent_state **cached_state)
1789 : {
1790 131791 : WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1791 131791 : return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1792 : cached_state, GFP_NOFS);
1793 : }
1794 :
1795 : /* see btrfs_writepage_start_hook for details on why this is required */
1796 : struct btrfs_writepage_fixup {
1797 : struct page *page;
1798 : struct btrfs_work work;
1799 : };
1800 :
1801 0 : static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1802 : {
1803 : struct btrfs_writepage_fixup *fixup;
1804 : struct btrfs_ordered_extent *ordered;
1805 0 : struct extent_state *cached_state = NULL;
1806 0 : struct page *page;
1807 : struct inode *inode;
1808 : u64 page_start;
1809 : u64 page_end;
1810 : int ret;
1811 :
1812 0 : fixup = container_of(work, struct btrfs_writepage_fixup, work);
1813 0 : page = fixup->page;
1814 : again:
1815 0 : lock_page(page);
1816 0 : if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1817 : ClearPageChecked(page);
1818 : goto out_page;
1819 : }
1820 :
1821 0 : inode = page->mapping->host;
1822 0 : page_start = page_offset(page);
1823 0 : page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1824 :
1825 0 : lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1826 : &cached_state);
1827 :
1828 : /* already ordered? We're done */
1829 0 : if (PagePrivate2(page))
1830 : goto out;
1831 :
1832 0 : ordered = btrfs_lookup_ordered_extent(inode, page_start);
1833 0 : if (ordered) {
1834 0 : unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1835 : page_end, &cached_state, GFP_NOFS);
1836 0 : unlock_page(page);
1837 0 : btrfs_start_ordered_extent(inode, ordered, 1);
1838 0 : btrfs_put_ordered_extent(ordered);
1839 0 : goto again;
1840 : }
1841 :
1842 0 : ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
1843 0 : if (ret) {
1844 0 : mapping_set_error(page->mapping, ret);
1845 0 : end_extent_writepage(page, ret, page_start, page_end);
1846 : ClearPageChecked(page);
1847 : goto out;
1848 : }
1849 :
1850 0 : btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1851 : ClearPageChecked(page);
1852 0 : set_page_dirty(page);
1853 : out:
1854 0 : unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1855 : &cached_state, GFP_NOFS);
1856 : out_page:
1857 0 : unlock_page(page);
1858 0 : page_cache_release(page);
1859 0 : kfree(fixup);
1860 0 : }
1861 :
1862 : /*
1863 : * There are a few paths in the higher layers of the kernel that directly
1864 : * set the page dirty bit without asking the filesystem if it is a
1865 : * good idea. This causes problems because we want to make sure COW
1866 : * properly happens and the data=ordered rules are followed.
1867 : *
1868 : * In our case any range that doesn't have the ORDERED bit set
1869 : * hasn't been properly set up for IO. We kick off an async process
1870 : * to fix it up. The async helper will wait for ordered extents, set
1871 : * the delalloc bit and make it safe to write the page.
1872 : */
1873 1308900 : static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1874 : {
1875 1308900 : struct inode *inode = page->mapping->host;
1876 : struct btrfs_writepage_fixup *fixup;
1877 1308900 : struct btrfs_root *root = BTRFS_I(inode)->root;
1878 :
1879 : /* this page is properly in the ordered list */
1880 1308918 : if (TestClearPagePrivate2(page))
1881 : return 0;
1882 :
1883 0 : if (PageChecked(page))
1884 : return -EAGAIN;
1885 :
1886 0 : fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1887 0 : if (!fixup)
1888 : return -EAGAIN;
1889 :
1890 : SetPageChecked(page);
1891 0 : page_cache_get(page);
1892 0 : btrfs_init_work(&fixup->work, btrfs_fixup_helper,
1893 : btrfs_writepage_fixup_worker, NULL, NULL);
1894 0 : fixup->page = page;
1895 0 : btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1896 0 : return -EBUSY;
1897 : }
1898 :
1899 53371 : static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1900 : struct inode *inode, u64 file_pos,
1901 : u64 disk_bytenr, u64 disk_num_bytes,
1902 : u64 num_bytes, u64 ram_bytes,
1903 : u8 compression, u8 encryption,
1904 : u16 other_encoding, int extent_type)
1905 : {
1906 53371 : struct btrfs_root *root = BTRFS_I(inode)->root;
1907 : struct btrfs_file_extent_item *fi;
1908 : struct btrfs_path *path;
1909 : struct extent_buffer *leaf;
1910 : struct btrfs_key ins;
1911 53371 : int extent_inserted = 0;
1912 : int ret;
1913 :
1914 53371 : path = btrfs_alloc_path();
1915 53374 : if (!path)
1916 : return -ENOMEM;
1917 :
1918 : /*
1919 : * we may be replacing one extent in the tree with another.
1920 : * The new extent is pinned in the extent map, and we don't want
1921 : * to drop it from the cache until it is completely in the btree.
1922 : *
1923 : * So, tell btrfs_drop_extents to leave this extent in the cache.
1924 : * the caller is expected to unpin it and allow it to be merged
1925 : * with the others.
1926 : */
1927 53373 : ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
1928 : file_pos + num_bytes, NULL, 0,
1929 : 1, sizeof(*fi), &extent_inserted);
1930 53376 : if (ret)
1931 : goto out;
1932 :
1933 53375 : if (!extent_inserted) {
1934 5148 : ins.objectid = btrfs_ino(inode);
1935 5148 : ins.offset = file_pos;
1936 5148 : ins.type = BTRFS_EXTENT_DATA_KEY;
1937 :
1938 5148 : path->leave_spinning = 1;
1939 : ret = btrfs_insert_empty_item(trans, root, path, &ins,
1940 : sizeof(*fi));
1941 5149 : if (ret)
1942 : goto out;
1943 : }
1944 53376 : leaf = path->nodes[0];
1945 106751 : fi = btrfs_item_ptr(leaf, path->slots[0],
1946 : struct btrfs_file_extent_item);
1947 53375 : btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1948 53373 : btrfs_set_file_extent_type(leaf, fi, extent_type);
1949 : btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1950 : btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1951 : btrfs_set_file_extent_offset(leaf, fi, 0);
1952 : btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1953 : btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1954 : btrfs_set_file_extent_compression(leaf, fi, compression);
1955 : btrfs_set_file_extent_encryption(leaf, fi, encryption);
1956 : btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1957 :
1958 53374 : btrfs_mark_buffer_dirty(leaf);
1959 53380 : btrfs_release_path(path);
1960 :
1961 53372 : inode_add_bytes(inode, num_bytes);
1962 :
1963 53378 : ins.objectid = disk_bytenr;
1964 53378 : ins.offset = disk_num_bytes;
1965 53378 : ins.type = BTRFS_EXTENT_ITEM_KEY;
1966 53378 : ret = btrfs_alloc_reserved_file_extent(trans, root,
1967 : root->root_key.objectid,
1968 : btrfs_ino(inode), file_pos, &ins);
1969 : out:
1970 53381 : btrfs_free_path(path);
1971 :
1972 53379 : return ret;
1973 : }
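: /*
:  * Note how 'ins' is reused above: first as the file-tree key
:  * (ino, EXTENT_DATA, file_pos) for the new file extent item, then as
:  * the extent-tree key (disk_bytenr, EXTENT_ITEM, disk_num_bytes) when
:  * the reserved extent is accounted.
:  */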
1974 :
1975 : /* snapshot-aware defrag */
1976 : struct sa_defrag_extent_backref {
1977 : struct rb_node node;
1978 : struct old_sa_defrag_extent *old;
1979 : u64 root_id;
1980 : u64 inum;
1981 : u64 file_pos;
1982 : u64 extent_offset;
1983 : u64 num_bytes;
1984 : u64 generation;
1985 : };
1986 :
1987 : struct old_sa_defrag_extent {
1988 : struct list_head list;
1989 : struct new_sa_defrag_extent *new;
1990 :
1991 : u64 extent_offset;
1992 : u64 bytenr;
1993 : u64 offset;
1994 : u64 len;
1995 : int count;
1996 : };
1997 :
1998 : struct new_sa_defrag_extent {
1999 : struct rb_root root;
2000 : struct list_head head;
2001 : struct btrfs_path *path;
2002 : struct inode *inode;
2003 : u64 file_pos;
2004 : u64 len;
2005 : u64 bytenr;
2006 : u64 disk_len;
2007 : u8 compress_type;
2008 : };
2009 :
2010 : static int backref_comp(struct sa_defrag_extent_backref *b1,
2011 : struct sa_defrag_extent_backref *b2)
2012 : {
2013 : if (b1->root_id < b2->root_id)
2014 : return -1;
2015 : else if (b1->root_id > b2->root_id)
2016 : return 1;
2017 :
2018 : if (b1->inum < b2->inum)
2019 : return -1;
2020 : else if (b1->inum > b2->inum)
2021 : return 1;
2022 :
2023 : if (b1->file_pos < b2->file_pos)
2024 : return -1;
2025 : else if (b1->file_pos > b2->file_pos)
2026 : return 1;
2027 :
2028 : /*
2029 : * [------------------------------] ===> (a range of space)
2030 : * |<--->| |<---->| =============> (fs/file tree A)
2031 : * |<---------------------------->| ===> (fs/file tree B)
2032 : *
2033 : * A range of space can refer to two file extents in one tree while
2034 : * refer to only one file extent in another tree.
2035 : *
2036 : * So we may process a disk offset more than one time(two extents in A)
2037 : * and locate at the same extent(one extent in B), then insert two same
2038 : * backrefs(both refer to the extent in B).
2039 : */
2040 : return 0;
2041 : }
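: /*
:  * Example of the resulting ordering (illustrative values): backrefs
:  * sort lexicographically by (root_id, inum, file_pos), so
:  *   {root 5, ino 257, pos 0}    <  {root 5, ino 257, pos 4096}
:  *   {root 5, ino 257, pos 4096} <  {root 5, ino 258, pos 0}
:  * and two backrefs with all three fields equal compare as 0; as noted
:  * above such duplicates are legitimate, and backref_insert() below
:  * simply links them adjacent to each other in the rb-tree.
:  */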
2042 :
2043 : static void backref_insert(struct rb_root *root,
2044 : struct sa_defrag_extent_backref *backref)
2045 : {
2046 : struct rb_node **p = &root->rb_node;
2047 : struct rb_node *parent = NULL;
2048 : struct sa_defrag_extent_backref *entry;
2049 : int ret;
2050 :
2051 : while (*p) {
2052 : parent = *p;
2053 : entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2054 :
2055 : ret = backref_comp(backref, entry);
2056 : if (ret < 0)
2057 : p = &(*p)->rb_left;
2058 : else
2059 : p = &(*p)->rb_right;
2060 : }
2061 :
2062 : rb_link_node(&backref->node, parent, p);
2063 : rb_insert_color(&backref->node, root);
2064 : }
2065 :
2066 : /*
2067 : * Note the backref might have changed, and in this case we just return 0.
2068 : */
2069 : static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2070 : void *ctx)
2071 : {
2072 : struct btrfs_file_extent_item *extent;
2073 : struct btrfs_fs_info *fs_info;
2074 : struct old_sa_defrag_extent *old = ctx;
2075 : struct new_sa_defrag_extent *new = old->new;
2076 : struct btrfs_path *path = new->path;
2077 : struct btrfs_key key;
2078 : struct btrfs_root *root;
2079 : struct sa_defrag_extent_backref *backref;
2080 : struct extent_buffer *leaf;
2081 : struct inode *inode = new->inode;
2082 : int slot;
2083 : int ret;
2084 : u64 extent_offset;
2085 : u64 num_bytes;
2086 :
2087 : if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2088 : inum == btrfs_ino(inode))
2089 : return 0;
2090 :
2091 : key.objectid = root_id;
2092 : key.type = BTRFS_ROOT_ITEM_KEY;
2093 : key.offset = (u64)-1;
2094 :
2095 : fs_info = BTRFS_I(inode)->root->fs_info;
2096 : root = btrfs_read_fs_root_no_name(fs_info, &key);
2097 : if (IS_ERR(root)) {
2098 : if (PTR_ERR(root) == -ENOENT)
2099 : return 0;
2100 : WARN_ON(1);
2101 : pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2102 : inum, offset, root_id);
2103 : return PTR_ERR(root);
2104 : }
2105 :
2106 : key.objectid = inum;
2107 : key.type = BTRFS_EXTENT_DATA_KEY;
2108 : if (offset > (u64)-1 << 32)
2109 : key.offset = 0;
2110 : else
2111 : key.offset = offset;
2112 :
2113 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2114 : if (WARN_ON(ret < 0))
2115 : return ret;
2116 : ret = 0;
2117 :
2118 : while (1) {
2119 : cond_resched();
2120 :
2121 : leaf = path->nodes[0];
2122 : slot = path->slots[0];
2123 :
2124 : if (slot >= btrfs_header_nritems(leaf)) {
2125 : ret = btrfs_next_leaf(root, path);
2126 : if (ret < 0) {
2127 : goto out;
2128 : } else if (ret > 0) {
2129 : ret = 0;
2130 : goto out;
2131 : }
2132 : continue;
2133 : }
2134 :
2135 : path->slots[0]++;
2136 :
2137 : btrfs_item_key_to_cpu(leaf, &key, slot);
2138 :
2139 : if (key.objectid > inum)
2140 : goto out;
2141 :
2142 : if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2143 : continue;
2144 :
2145 : extent = btrfs_item_ptr(leaf, slot,
2146 : struct btrfs_file_extent_item);
2147 :
2148 : if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2149 : continue;
2150 :
2151 : /*
2152 : * 'offset' refers to the exact key.offset,
2153 : * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
2154 : * (key.offset - extent_offset).
2155 : */
2156 : if (key.offset != offset)
2157 : continue;
2158 :
2159 : extent_offset = btrfs_file_extent_offset(leaf, extent);
2160 : num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2161 :
2162 : if (extent_offset >= old->extent_offset + old->offset +
2163 : old->len || extent_offset + num_bytes <=
2164 : old->extent_offset + old->offset)
2165 : continue;
2166 : break;
2167 : }
2168 :
2169 : backref = kmalloc(sizeof(*backref), GFP_NOFS);
2170 : if (!backref) {
2171 : ret = -ENOMEM;	/* allocation failure, not a missing item */
2172 : goto out;
2173 : }
2174 :
2175 : backref->root_id = root_id;
2176 : backref->inum = inum;
2177 : backref->file_pos = offset;
2178 : backref->num_bytes = num_bytes;
2179 : backref->extent_offset = extent_offset;
2180 : backref->generation = btrfs_file_extent_generation(leaf, extent);
2181 : backref->old = old;
2182 : backref_insert(&new->root, backref);
2183 : old->count++;
2184 : out:
2185 : btrfs_release_path(path);
2186 : WARN_ON(ret);
2187 : return ret;
2188 : }
2189 :
2190 : static noinline bool record_extent_backrefs(struct btrfs_path *path,
2191 : struct new_sa_defrag_extent *new)
2192 : {
2193 : struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2194 : struct old_sa_defrag_extent *old, *tmp;
2195 : int ret;
2196 :
2197 : new->path = path;
2198 :
2199 : list_for_each_entry_safe(old, tmp, &new->head, list) {
2200 : ret = iterate_inodes_from_logical(old->bytenr +
2201 : old->extent_offset, fs_info,
2202 : path, record_one_backref,
2203 : old);
2204 : if (ret < 0 && ret != -ENOENT)
2205 : return false;
2206 :
2207 : /* no backref to be processed for this extent */
2208 : if (!old->count) {
2209 : list_del(&old->list);
2210 : kfree(old);
2211 : }
2212 : }
2213 :
2214 : if (list_empty(&new->head))
2215 : return false;
2216 :
2217 : return true;
2218 : }
2219 :
2220 : static int relink_is_mergable(struct extent_buffer *leaf,
2221 : struct btrfs_file_extent_item *fi,
2222 : struct new_sa_defrag_extent *new)
2223 : {
2224 : if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2225 : return 0;
2226 :
2227 : if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2228 : return 0;
2229 :
2230 : if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2231 : return 0;
2232 :
2233 : if (btrfs_file_extent_encryption(leaf, fi) ||
2234 : btrfs_file_extent_other_encoding(leaf, fi))
2235 : return 0;
2236 :
2237 : return 1;
2238 : }
2239 :
2240 : /*
2241 : * Note the backref might have changed, and in this case we just return 0.
2242 : */
2243 : static noinline int relink_extent_backref(struct btrfs_path *path,
2244 : struct sa_defrag_extent_backref *prev,
2245 : struct sa_defrag_extent_backref *backref)
2246 : {
2247 : struct btrfs_file_extent_item *extent;
2248 : struct btrfs_file_extent_item *item;
2249 : struct btrfs_ordered_extent *ordered;
2250 : struct btrfs_trans_handle *trans;
2251 : struct btrfs_fs_info *fs_info;
2252 : struct btrfs_root *root;
2253 : struct btrfs_key key;
2254 : struct extent_buffer *leaf;
2255 : struct old_sa_defrag_extent *old = backref->old;
2256 : struct new_sa_defrag_extent *new = old->new;
2257 : struct inode *src_inode = new->inode;
2258 : struct inode *inode;
2259 : struct extent_state *cached = NULL;
2260 : int ret = 0;
2261 : u64 start;
2262 : u64 len;
2263 : u64 lock_start;
2264 : u64 lock_end;
2265 : bool merge = false;
2266 : int index;
2267 :
2268 : if (prev && prev->root_id == backref->root_id &&
2269 : prev->inum == backref->inum &&
2270 : prev->file_pos + prev->num_bytes == backref->file_pos)
2271 : merge = true;
2272 :
2273 : /* step 1: get root */
2274 : key.objectid = backref->root_id;
2275 : key.type = BTRFS_ROOT_ITEM_KEY;
2276 : key.offset = (u64)-1;
2277 :
2278 : fs_info = BTRFS_I(src_inode)->root->fs_info;
2279 : index = srcu_read_lock(&fs_info->subvol_srcu);
2280 :
2281 : root = btrfs_read_fs_root_no_name(fs_info, &key);
2282 : if (IS_ERR(root)) {
2283 : srcu_read_unlock(&fs_info->subvol_srcu, index);
2284 : if (PTR_ERR(root) == -ENOENT)
2285 : return 0;
2286 : return PTR_ERR(root);
2287 : }
2288 :
2289 : if (btrfs_root_readonly(root)) {
2290 : srcu_read_unlock(&fs_info->subvol_srcu, index);
2291 : return 0;
2292 : }
2293 :
2294 : /* step 2: get inode */
2295 : key.objectid = backref->inum;
2296 : key.type = BTRFS_INODE_ITEM_KEY;
2297 : key.offset = 0;
2298 :
2299 : inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2300 : if (IS_ERR(inode)) {
2301 : srcu_read_unlock(&fs_info->subvol_srcu, index);
2302 : return 0;
2303 : }
2304 :
2305 : srcu_read_unlock(&fs_info->subvol_srcu, index);
2306 :
2307 : /* step 3: relink backref */
2308 : lock_start = backref->file_pos;
2309 : lock_end = backref->file_pos + backref->num_bytes - 1;
2310 : lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2311 : 0, &cached);
2312 :
2313 : ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2314 : if (ordered) {
2315 : btrfs_put_ordered_extent(ordered);
2316 : goto out_unlock;
2317 : }
2318 :
2319 : trans = btrfs_join_transaction(root);
2320 : if (IS_ERR(trans)) {
2321 : ret = PTR_ERR(trans);
2322 : goto out_unlock;
2323 : }
2324 :
2325 : key.objectid = backref->inum;
2326 : key.type = BTRFS_EXTENT_DATA_KEY;
2327 : key.offset = backref->file_pos;
2328 :
2329 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2330 : if (ret < 0) {
2331 : goto out_free_path;
2332 : } else if (ret > 0) {
2333 : ret = 0;
2334 : goto out_free_path;
2335 : }
2336 :
2337 : extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2338 : struct btrfs_file_extent_item);
2339 :
2340 : if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2341 : backref->generation)
2342 : goto out_free_path;
2343 :
2344 : btrfs_release_path(path);
2345 :
2346 : start = backref->file_pos;
2347 : if (backref->extent_offset < old->extent_offset + old->offset)
2348 : start += old->extent_offset + old->offset -
2349 : backref->extent_offset;
2350 :
2351 : len = min(backref->extent_offset + backref->num_bytes,
2352 : old->extent_offset + old->offset + old->len);
2353 : len -= max(backref->extent_offset, old->extent_offset + old->offset);
2354 :
2355 : ret = btrfs_drop_extents(trans, root, inode, start,
2356 : start + len, 1);
2357 : if (ret)
2358 : goto out_free_path;
2359 : again:
2360 : key.objectid = btrfs_ino(inode);
2361 : key.type = BTRFS_EXTENT_DATA_KEY;
2362 : key.offset = start;
2363 :
2364 : path->leave_spinning = 1;
2365 : if (merge) {
2366 : struct btrfs_file_extent_item *fi;
2367 : u64 extent_len;
2368 : struct btrfs_key found_key;
2369 :
2370 : ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2371 : if (ret < 0)
2372 : goto out_free_path;
2373 :
2374 : path->slots[0]--;
2375 : leaf = path->nodes[0];
2376 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2377 :
2378 : fi = btrfs_item_ptr(leaf, path->slots[0],
2379 : struct btrfs_file_extent_item);
2380 : extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2381 :
2382 : if (extent_len + found_key.offset == start &&
2383 : relink_is_mergable(leaf, fi, new)) {
2384 : btrfs_set_file_extent_num_bytes(leaf, fi,
2385 : extent_len + len);
2386 : btrfs_mark_buffer_dirty(leaf);
2387 : inode_add_bytes(inode, len);
2388 :
2389 : ret = 1;
2390 : goto out_free_path;
2391 : } else {
2392 : merge = false;
2393 : btrfs_release_path(path);
2394 : goto again;
2395 : }
2396 : }
2397 :
2398 : ret = btrfs_insert_empty_item(trans, root, path, &key,
2399 : sizeof(*extent));
2400 : if (ret) {
2401 : btrfs_abort_transaction(trans, root, ret);
2402 : goto out_free_path;
2403 : }
2404 :
2405 : leaf = path->nodes[0];
2406 : item = btrfs_item_ptr(leaf, path->slots[0],
2407 : struct btrfs_file_extent_item);
2408 : btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2409 : btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2410 : btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2411 : btrfs_set_file_extent_num_bytes(leaf, item, len);
2412 : btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2413 : btrfs_set_file_extent_generation(leaf, item, trans->transid);
2414 : btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2415 : btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2416 : btrfs_set_file_extent_encryption(leaf, item, 0);
2417 : btrfs_set_file_extent_other_encoding(leaf, item, 0);
2418 :
2419 : btrfs_mark_buffer_dirty(leaf);
2420 : inode_add_bytes(inode, len);
2421 : btrfs_release_path(path);
2422 :
2423 : ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2424 : new->disk_len, 0,
2425 : backref->root_id, backref->inum,
2426 : new->file_pos, 0); /* start - extent_offset */
2427 : if (ret) {
2428 : btrfs_abort_transaction(trans, root, ret);
2429 : goto out_free_path;
2430 : }
2431 :
2432 : ret = 1;
2433 : out_free_path:
2434 : btrfs_release_path(path);
2435 : path->leave_spinning = 0;
2436 : btrfs_end_transaction(trans, root);
2437 : out_unlock:
2438 : unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2439 : &cached, GFP_NOFS);
2440 : iput(inode);
2441 : return ret;
2442 : }
2443 :
2444 : static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2445 : {
2446 : struct old_sa_defrag_extent *old, *tmp;
2447 :
2448 : if (!new)
2449 : return;
2450 :
2451 : list_for_each_entry_safe(old, tmp, &new->head, list) {
2452 : list_del(&old->list);
2453 : kfree(old);
2454 : }
2455 : kfree(new);
2456 : }
2457 :
2458 : static void relink_file_extents(struct new_sa_defrag_extent *new)
2459 : {
2460 : struct btrfs_path *path;
2461 : struct sa_defrag_extent_backref *backref;
2462 : struct sa_defrag_extent_backref *prev = NULL;
2463 : struct inode *inode;
2464 : struct btrfs_root *root;
2465 : struct rb_node *node;
2466 : int ret;
2467 :
2468 : inode = new->inode;
2469 : root = BTRFS_I(inode)->root;
2470 :
2471 : path = btrfs_alloc_path();
2472 : if (!path)
2473 : return;
2474 :
2475 : if (!record_extent_backrefs(path, new)) {
2476 : btrfs_free_path(path);
2477 : goto out;
2478 : }
2479 : btrfs_release_path(path);
2480 :
2481 : while (1) {
2482 : node = rb_first(&new->root);
2483 : if (!node)
2484 : break;
2485 : rb_erase(node, &new->root);
2486 :
2487 : backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2488 :
2489 : ret = relink_extent_backref(path, prev, backref);
2490 : WARN_ON(ret < 0);
2491 :
2492 : kfree(prev);
2493 :
2494 : if (ret == 1)
2495 : prev = backref;
2496 : else
2497 : prev = NULL;
2498 : cond_resched();
2499 : }
2500 : kfree(prev);
2501 :
2502 : btrfs_free_path(path);
2503 : out:
2504 : free_sa_defrag_extent(new);
2505 :
2506 : atomic_dec(&root->fs_info->defrag_running);
2507 : wake_up(&root->fs_info->transaction_wait);
2508 : }
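: /*
:  * The loop above drains the rb-tree with the usual rb_first/rb_erase
:  * pattern while carrying the previously relinked backref in 'prev' so
:  * that adjacent backrefs can be merged.  The bare drain, with
:  * hypothetical types, looks like:
:  */
: #if 0
: while ((node = rb_first(&root)) != NULL) {
: 	rb_erase(node, &root);
: 	entry = rb_entry(node, struct toy_entry, node);
: 	process(entry);		/* node is already unlinked from the tree */
: }
: #endif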
2509 :
2510 : static struct new_sa_defrag_extent *
2511 : record_old_file_extents(struct inode *inode,
2512 : struct btrfs_ordered_extent *ordered)
2513 : {
2514 : struct btrfs_root *root = BTRFS_I(inode)->root;
2515 : struct btrfs_path *path;
2516 : struct btrfs_key key;
2517 : struct old_sa_defrag_extent *old;
2518 : struct new_sa_defrag_extent *new;
2519 : int ret;
2520 :
2521 : new = kmalloc(sizeof(*new), GFP_NOFS);
2522 : if (!new)
2523 : return NULL;
2524 :
2525 : new->inode = inode;
2526 : new->file_pos = ordered->file_offset;
2527 : new->len = ordered->len;
2528 : new->bytenr = ordered->start;
2529 : new->disk_len = ordered->disk_len;
2530 : new->compress_type = ordered->compress_type;
2531 : new->root = RB_ROOT;
2532 : INIT_LIST_HEAD(&new->head);
2533 :
2534 : path = btrfs_alloc_path();
2535 : if (!path)
2536 : goto out_kfree;
2537 :
2538 : key.objectid = btrfs_ino(inode);
2539 : key.type = BTRFS_EXTENT_DATA_KEY;
2540 : key.offset = new->file_pos;
2541 :
2542 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2543 : if (ret < 0)
2544 : goto out_free_path;
2545 : if (ret > 0 && path->slots[0] > 0)
2546 : path->slots[0]--;
2547 :
2548 : /* find out all the old extents for the file range */
2549 : while (1) {
2550 : struct btrfs_file_extent_item *extent;
2551 : struct extent_buffer *l;
2552 : int slot;
2553 : u64 num_bytes;
2554 : u64 offset;
2555 : u64 end;
2556 : u64 disk_bytenr;
2557 : u64 extent_offset;
2558 :
2559 : l = path->nodes[0];
2560 : slot = path->slots[0];
2561 :
2562 : if (slot >= btrfs_header_nritems(l)) {
2563 : ret = btrfs_next_leaf(root, path);
2564 : if (ret < 0)
2565 : goto out_free_path;
2566 : else if (ret > 0)
2567 : break;
2568 : continue;
2569 : }
2570 :
2571 : btrfs_item_key_to_cpu(l, &key, slot);
2572 :
2573 : if (key.objectid != btrfs_ino(inode))
2574 : break;
2575 : if (key.type != BTRFS_EXTENT_DATA_KEY)
2576 : break;
2577 : if (key.offset >= new->file_pos + new->len)
2578 : break;
2579 :
2580 : extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2581 :
2582 : num_bytes = btrfs_file_extent_num_bytes(l, extent);
2583 : if (key.offset + num_bytes < new->file_pos)
2584 : goto next;
2585 :
2586 : disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2587 : if (!disk_bytenr)
2588 : goto next;
2589 :
2590 : extent_offset = btrfs_file_extent_offset(l, extent);
2591 :
2592 : old = kmalloc(sizeof(*old), GFP_NOFS);
2593 : if (!old)
2594 : goto out_free_path;
2595 :
2596 : offset = max(new->file_pos, key.offset);
2597 : end = min(new->file_pos + new->len, key.offset + num_bytes);
2598 :
2599 : old->bytenr = disk_bytenr;
2600 : old->extent_offset = extent_offset;
2601 : old->offset = offset - key.offset;
2602 : old->len = end - offset;
2603 : old->new = new;
2604 : old->count = 0;
2605 : list_add_tail(&old->list, &new->head);
2606 : next:
2607 : path->slots[0]++;
2608 : cond_resched();
2609 : }
2610 :
2611 : btrfs_free_path(path);
2612 : atomic_inc(&root->fs_info->defrag_running);
2613 :
2614 : return new;
2615 :
2616 : out_free_path:
2617 : btrfs_free_path(path);
2618 : out_kfree:
2619 : free_sa_defrag_extent(new);
2620 : return NULL;
2621 : }
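: /*
:  * Worked example of the clamping above (illustrative numbers): for an
:  * ordered extent covering file range [8K, 24K) (new->file_pos = 8K,
:  * new->len = 16K) and an old file extent item at key.offset = 4K with
:  * num_bytes = 8K (covering [4K, 12K)), the overlap is
:  *   offset = max(8K, 4K)       = 8K
:  *   end    = min(24K, 4K + 8K) = 12K
:  * giving old->offset = 8K - 4K = 4K and old->len = 12K - 8K = 4K: only
:  * the part of the old extent the defragged range actually covers is
:  * recorded.
:  */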
2622 :
2623 45797 : static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
2624 : u64 start, u64 len)
2625 : {
2626 : struct btrfs_block_group_cache *cache;
2627 :
2628 45797 : cache = btrfs_lookup_block_group(root->fs_info, start);
2629 : ASSERT(cache);
2630 :
2631 : spin_lock(&cache->lock);
2632 45798 : cache->delalloc_bytes -= len;
2633 : spin_unlock(&cache->lock);
2634 :
2635 45798 : btrfs_put_block_group(cache);
2636 45797 : }
2637 :
2638 : /* as ordered data IO finishes, this gets called so we can finish
2639 : * an ordered extent if the range of bytes in the file it covers is
2640 : * fully written.
2641 : */
2642 51505 : static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2643 : {
2644 51505 : struct inode *inode = ordered_extent->inode;
2645 97302 : struct btrfs_root *root = BTRFS_I(inode)->root;
2646 : struct btrfs_trans_handle *trans = NULL;
2647 51505 : struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2648 51505 : struct extent_state *cached_state = NULL;
2649 : struct new_sa_defrag_extent *new = NULL;
2650 : int compress_type = 0;
2651 : int ret = 0;
2652 51505 : u64 logical_len = ordered_extent->len;
2653 : bool nolock;
2654 : bool truncated = false;
2655 :
2656 51505 : nolock = btrfs_is_free_space_inode(inode);
2657 :
2658 51505 : if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2659 : ret = -EIO;
2660 : goto out;
2661 : }
2662 :
2663 51505 : if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2664 : truncated = true;
2665 0 : logical_len = ordered_extent->truncated_len;
2666 : /* Truncated the entire extent, don't bother adding */
2667 0 : if (!logical_len)
2668 : goto out;
2669 : }
2670 :
2671 51505 : if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2672 164 : BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2673 82 : btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2674 82 : if (nolock)
2675 82 : trans = btrfs_join_transaction_nolock(root);
2676 : else
2677 0 : trans = btrfs_join_transaction(root);
2678 82 : if (IS_ERR(trans)) {
2679 0 : ret = PTR_ERR(trans);
2680 : trans = NULL;
2681 0 : goto out;
2682 : }
2683 82 : trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2684 82 : ret = btrfs_update_inode_fallback(trans, root, inode);
2685 82 : if (ret) /* -ENOMEM or corruption */
2686 0 : btrfs_abort_transaction(trans, root, ret);
2687 : goto out;
2688 : }
2689 :
2690 51423 : lock_extent_bits(io_tree, ordered_extent->file_offset,
2691 51423 : ordered_extent->file_offset + ordered_extent->len - 1,
2692 : 0, &cached_state);
2693 :
2694 102830 : ret = test_range_bit(io_tree, ordered_extent->file_offset,
2695 51415 : ordered_extent->file_offset + ordered_extent->len - 1,
2696 : EXTENT_DEFRAG, 1, cached_state);
2697 51419 : if (ret) {
2698 : u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
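: /*
:  * Note: the "0 &&" in the test below appears to deliberately disable
:  * snapshot-aware defrag, so record_old_file_extents() is currently
:  * never reached from here.
:  */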
2699 : if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2700 : /* the inode is shared */
2701 : new = record_old_file_extents(inode, ordered_extent);
2702 :
2703 66 : clear_extent_bit(io_tree, ordered_extent->file_offset,
2704 66 : ordered_extent->file_offset + ordered_extent->len - 1,
2705 : EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2706 : }
2707 :
2708 51419 : if (nolock)
2709 3975 : trans = btrfs_join_transaction_nolock(root);
2710 : else
2711 47444 : trans = btrfs_join_transaction(root);
2712 51417 : if (IS_ERR(trans)) {
2713 0 : ret = PTR_ERR(trans);
2714 : trans = NULL;
2715 0 : goto out_unlock;
2716 : }
2717 :
2718 51417 : trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2719 :
2720 51417 : if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2721 153 : compress_type = ordered_extent->compress_type;
2722 51417 : if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2723 5625 : BUG_ON(compress_type);
2724 5625 : ret = btrfs_mark_extent_written(trans, inode,
2725 : ordered_extent->file_offset,
2726 5625 : ordered_extent->file_offset +
2727 : logical_len);
2728 : } else {
2729 45792 : BUG_ON(root == root->fs_info->tree_root);
2730 45792 : ret = insert_reserved_file_extent(trans, inode,
2731 : ordered_extent->file_offset,
2732 : ordered_extent->start,
2733 : ordered_extent->disk_len,
2734 : logical_len, logical_len,
2735 : compress_type, 0, 0,
2736 : BTRFS_FILE_EXTENT_REG);
2737 45797 : if (!ret)
2738 91594 : btrfs_release_delalloc_bytes(root,
2739 : ordered_extent->start,
2740 : ordered_extent->disk_len);
2741 : }
2742 51422 : unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2743 : ordered_extent->file_offset, ordered_extent->len,
2744 : trans->transid);
2745 51421 : if (ret < 0) {
2746 0 : btrfs_abort_transaction(trans, root, ret);
2747 0 : goto out_unlock;
2748 : }
2749 :
2750 51421 : add_pending_csums(trans, inode, ordered_extent->file_offset,
2751 : &ordered_extent->list);
2752 :
2753 51423 : btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2754 51423 : ret = btrfs_update_inode_fallback(trans, root, inode);
2755 51422 : if (ret) { /* -ENOMEM or corruption */
2756 0 : btrfs_abort_transaction(trans, root, ret);
2757 0 : goto out_unlock;
2758 : }
2759 : ret = 0;
2760 : out_unlock:
2761 51422 : unlock_extent_cached(io_tree, ordered_extent->file_offset,
2762 102844 : ordered_extent->file_offset +
2763 51422 : ordered_extent->len - 1, &cached_state, GFP_NOFS);
2764 : out:
2765 51504 : if (root != root->fs_info->tree_root)
2766 47447 : btrfs_delalloc_release_metadata(inode, ordered_extent->len);
2767 51505 : if (trans)
2768 51505 : btrfs_end_transaction(trans, root);
2769 :
2770 51505 : if (ret || truncated) {
2771 : u64 start, end;
2772 :
2773 0 : if (truncated)
2774 0 : start = ordered_extent->file_offset + logical_len;
2775 : else
2776 0 : start = ordered_extent->file_offset;
2777 0 : end = ordered_extent->file_offset + ordered_extent->len - 1;
2778 0 : clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
2779 :
2780 : /* Drop the cache for the part of the extent we didn't write. */
2781 0 : btrfs_drop_extent_cache(inode, start, end, 0);
2782 :
2783 : /*
2784 : * If the ordered extent had an IOERR or something else went
2785 : * wrong we need to return the space for this ordered extent
2786 : * back to the allocator. We only free the extent in the
2787 : * truncated case if we didn't write out the extent at all.
2788 : */
2789 0 : if ((ret || !logical_len) &&
2790 0 : !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2791 : !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2792 0 : btrfs_free_reserved_extent(root, ordered_extent->start,
2793 : ordered_extent->disk_len, 1);
2794 : }
2795 :
2796 :
2797 : /*
2798 : * This needs to be done to make sure anybody waiting knows we are done
2799 : * updating everything for this ordered extent.
2800 : */
2801 51505 : btrfs_remove_ordered_extent(inode, ordered_extent);
2802 :
2803 : /* for snapshot-aware defrag */
2804 : if (new) {
2805 : if (ret) {
2806 : free_sa_defrag_extent(new);
2807 : atomic_dec(&root->fs_info->defrag_running);
2808 : } else {
2809 : relink_file_extents(new);
2810 : }
2811 : }
2812 :
2813 : /* once for us */
2814 51505 : btrfs_put_ordered_extent(ordered_extent);
2815 : /* once for the tree */
2816 51505 : btrfs_put_ordered_extent(ordered_extent);
2817 :
2818 51505 : return ret;
2819 : }
2820 :
2821 51505 : static void finish_ordered_fn(struct btrfs_work *work)
2822 : {
2823 : struct btrfs_ordered_extent *ordered_extent;
2824 51505 : ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
2825 51505 : btrfs_finish_ordered_io(ordered_extent);
2826 51505 : }
2827 :
2828 1309032 : static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2829 : struct extent_state *state, int uptodate)
2830 : {
2831 1309032 : struct inode *inode = page->mapping->host;
2832 1309032 : struct btrfs_root *root = BTRFS_I(inode)->root;
2833 1309032 : struct btrfs_ordered_extent *ordered_extent = NULL;
2834 : struct btrfs_workqueue *wq;
2835 : btrfs_work_func_t func;
2836 :
2837 1309032 : trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2838 :
2839 : ClearPagePrivate2(page);
2840 1309063 : if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
2841 1309063 : end - start + 1, uptodate))
2842 : return 0;
2843 :
2844 26245 : if (btrfs_is_free_space_inode(inode)) {
2845 4057 : wq = root->fs_info->endio_freespace_worker;
2846 : func = btrfs_freespace_write_helper;
2847 : } else {
2848 22188 : wq = root->fs_info->endio_write_workers;
2849 : func = btrfs_endio_write_helper;
2850 : }
2851 :
2852 26245 : btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
2853 : NULL);
2854 26245 : btrfs_queue_work(wq, &ordered_extent->work);
2855 :
2856 26245 : return 0;
2857 : }
2858 :
2859 : /*
2860 : * when reads are done, we need to check csums to verify the data is correct
2861 : * if there's a match, we allow the bio to finish. If not, the code in
2862 : * extent_io.c will try to find good copies for us.
2863 : */
2864 72320 : static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
2865 72320 : u64 phy_offset, struct page *page,
2866 : u64 start, u64 end, int mirror)
2867 : {
2868 72320 : size_t offset = start - page_offset(page);
2869 72320 : struct inode *inode = page->mapping->host;
2870 72320 : struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2871 : char *kaddr;
2872 72320 : struct btrfs_root *root = BTRFS_I(inode)->root;
2873 : u32 csum_expected;
2874 72320 : u32 csum = ~(u32)0;
2875 : static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2876 : DEFAULT_RATELIMIT_BURST);
2877 :
2878 72320 : if (PageChecked(page)) {
2879 : ClearPageChecked(page);
2880 : goto good;
2881 : }
2882 :
2883 71988 : if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
2884 : goto good;
2885 :
2886 63248 : if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
2887 1276 : test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
2888 254 : clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
2889 : GFP_NOFS);
2890 254 : return 0;
2891 : }
2892 :
2893 61718 : phy_offset >>= inode->i_sb->s_blocksize_bits;
2894 61718 : csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
2895 :
2896 : kaddr = kmap_atomic(page);
2897 61711 : csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
2898 61658 : btrfs_csum_final(csum, (char *)&csum);
2899 61659 : if (csum != csum_expected)
2900 : goto zeroit;
2901 :
2902 : kunmap_atomic(kaddr);
2903 : good:
2904 : return 0;
2905 :
2906 : zeroit:
2907 0 : if (__ratelimit(&_rs))
2908 0 : btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
2909 : btrfs_ino(page->mapping->host), start, csum, csum_expected);
2910 0 : memset(kaddr + offset, 1, end - start + 1);
2911 : flush_dcache_page(page);
2912 : kunmap_atomic(kaddr);
2913 0 : if (csum_expected == 0)
2914 : return 0;
2915 0 : return -EIO;
2916 : }
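: /*
:  * A user-space flavoured sketch of the per-block verification above
:  * (hypothetical toy_* names; btrfs really uses crc32c through
:  * btrfs_csum_data/btrfs_csum_final): the bio carries one u32 csum per
:  * filesystem block, indexed by phy_offset >> blocksize_bits exactly as
:  * in the code.
:  */
: #if 0
: static int toy_verify_block(const u8 *data, size_t blocksize,
: 			    const u32 *csums, size_t block_index)
: {
: 	u32 csum = toy_crc32c(~(u32)0, data, blocksize);	/* assumed helper */
:
: 	return csum == csums[block_index] ? 0 : -EIO;
: }
: #endif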
2917 :
2918 : struct delayed_iput {
2919 : struct list_head list;
2920 : struct inode *inode;
2921 : };
2922 :
2923 : /* JDM: If this is fs-wide, why can't we add a pointer to
2924 : * btrfs_inode instead and avoid the allocation? */
2925 86695 : void btrfs_add_delayed_iput(struct inode *inode)
2926 : {
2927 86695 : struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2928 : struct delayed_iput *delayed;
2929 :
2930 86702 : if (atomic_add_unless(&inode->i_count, -1, 1))
2931 86702 : return;
2932 :
2933 : delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
2934 581 : delayed->inode = inode;
2935 :
2936 : spin_lock(&fs_info->delayed_iput_lock);
2937 581 : list_add_tail(&delayed->list, &fs_info->delayed_iputs);
2938 : spin_unlock(&fs_info->delayed_iput_lock);
2939 : }
2940 :
2941 6133 : void btrfs_run_delayed_iputs(struct btrfs_root *root)
2942 : {
2943 6133 : LIST_HEAD(list);
2944 6133 : struct btrfs_fs_info *fs_info = root->fs_info;
2945 : struct delayed_iput *delayed;
2946 : int empty;
2947 :
2948 : spin_lock(&fs_info->delayed_iput_lock);
2949 6133 : empty = list_empty(&fs_info->delayed_iputs);
2950 : spin_unlock(&fs_info->delayed_iput_lock);
2951 6133 : if (empty)
2952 5961 : return;
2953 :
2954 : spin_lock(&fs_info->delayed_iput_lock);
2955 : list_splice_init(&fs_info->delayed_iputs, &list);
2956 : spin_unlock(&fs_info->delayed_iput_lock);
2957 :
2958 753 : while (!list_empty(&list)) {
2959 : delayed = list_entry(list.next, struct delayed_iput, list);
2960 581 : list_del(&delayed->list);
2961 581 : iput(delayed->inode);
2962 581 : kfree(delayed);
2963 : }
2964 : }
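: /*
:  * The function above is the standard splice-under-lock pattern: move
:  * the whole shared list onto a private head while holding the lock,
:  * then walk it without the lock.  In isolation (shared_lock,
:  * shared_list and toy_item are hypothetical):
:  */
: #if 0
: LIST_HEAD(list);
: struct toy_item *entry;
:
: spin_lock(&shared_lock);
: list_splice_init(&shared_list, &list);
: spin_unlock(&shared_lock);
:
: while (!list_empty(&list)) {
: 	entry = list_first_entry(&list, struct toy_item, list);
: 	list_del(&entry->list);
: 	consume(entry);		/* safe: nobody else sees 'list' */
: }
: #endif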
2965 :
2966 : /*
2967 : * This is called in transaction commit time. If there are no orphan
2968 : * files in the subvolume, it removes orphan item and frees block_rsv
2969 : * structure.
2970 : */
2971 2548 : void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2972 : struct btrfs_root *root)
2973 : {
2974 : struct btrfs_block_rsv *block_rsv;
2975 : int ret;
2976 :
2977 4858 : if (atomic_read(&root->orphan_inodes) ||
2978 2310 : root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2979 : return;
2980 :
2981 : spin_lock(&root->orphan_lock);
2982 2259 : if (atomic_read(&root->orphan_inodes)) {
2983 : spin_unlock(&root->orphan_lock);
2984 : return;
2985 : }
2986 :
2987 2259 : if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
2988 : spin_unlock(&root->orphan_lock);
2989 : return;
2990 : }
2991 :
2992 2259 : block_rsv = root->orphan_block_rsv;
2993 2259 : root->orphan_block_rsv = NULL;
2994 : spin_unlock(&root->orphan_lock);
2995 :
2996 3063 : if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
2997 : btrfs_root_refs(&root->root_item) > 0) {
2998 0 : ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2999 : root->root_key.objectid);
3000 0 : if (ret)
3001 0 : btrfs_abort_transaction(trans, root, ret);
3002 : else
3003 : clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3004 : &root->state);
3005 : }
3006 :
3007 2259 : if (block_rsv) {
3008 511 : WARN_ON(block_rsv->size > 0);
3009 511 : btrfs_free_block_rsv(root, block_rsv);
3010 : }
3011 : }
3012 :
3013 : /*
3014 : * This creates an orphan entry for the given inode in case something goes
3015 : * wrong in the middle of an unlink/truncate.
3016 : *
3017 : * NOTE: the caller of this function should reserve 5 units of metadata
3018 : * before calling it.
3019 : */
3020 9630 : int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
3021 : {
3022 9630 : struct btrfs_root *root = BTRFS_I(inode)->root;
3023 : struct btrfs_block_rsv *block_rsv = NULL;
3024 : int reserve = 0;
3025 : int insert = 0;
3026 : int ret;
3027 :
3028 9630 : if (!root->orphan_block_rsv) {
3029 513 : block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3030 513 : if (!block_rsv)
3031 : return -ENOMEM;
3032 : }
3033 :
3034 : spin_lock(&root->orphan_lock);
3035 9630 : if (!root->orphan_block_rsv) {
3036 513 : root->orphan_block_rsv = block_rsv;
3037 9117 : } else if (block_rsv) {
3038 0 : btrfs_free_block_rsv(root, block_rsv);
3039 : block_rsv = NULL;
3040 : }
3041 :
3042 9630 : if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3043 9630 : &BTRFS_I(inode)->runtime_flags)) {
3044 : #if 0
3045 : /*
3046 : * For proper ENOSPC handling, we should do orphan
3047 : * cleanup when mounting. But this introduces backward
3048 : * compatibility issue.
3049 : */
3050 : if (!xchg(&root->orphan_item_inserted, 1))
3051 : insert = 2;
3052 : else
3053 : insert = 1;
3054 : #endif
3055 : insert = 1;
3056 9630 : atomic_inc(&root->orphan_inodes);
3057 : }
3058 :
3059 9630 : if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3060 : &BTRFS_I(inode)->runtime_flags))
3061 : reserve = 1;
3062 : spin_unlock(&root->orphan_lock);
3063 :
3064 : /* grab metadata reservation from transaction handle */
3065 9630 : if (reserve) {
3066 9630 : ret = btrfs_orphan_reserve_metadata(trans, inode);
3067 9630 : BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
3068 : }
3069 :
3070 : /* insert an orphan item to track this unlinked/truncated file */
3071 9630 : if (insert >= 1) {
3072 9630 : ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3073 9630 : if (ret) {
3074 0 : atomic_dec(&root->orphan_inodes);
3075 0 : if (reserve) {
3076 : clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3077 : &BTRFS_I(inode)->runtime_flags);
3078 0 : btrfs_orphan_release_metadata(inode);
3079 : }
3080 0 : if (ret != -EEXIST) {
3081 : clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3082 : &BTRFS_I(inode)->runtime_flags);
3083 0 : btrfs_abort_transaction(trans, root, ret);
3084 0 : return ret;
3085 : }
3086 : }
3087 : ret = 0;
3088 : }
3089 :
3090 : /* insert an orphan item to record that the subvolume contains orphan files */
3091 9630 : if (insert >= 2) {
3092 0 : ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
3093 : root->root_key.objectid);
3094 0 : if (ret && ret != -EEXIST) {
3095 0 : btrfs_abort_transaction(trans, root, ret);
3096 0 : return ret;
3097 : }
3098 : }
3099 : return 0;
3100 : }
3101 :
3102 : /*
3103 : * We have done the truncate/delete so we can go ahead and remove the orphan
3104 : * item for this particular inode.
3105 : */
3106 9631 : static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3107 : struct inode *inode)
3108 : {
3109 9631 : struct btrfs_root *root = BTRFS_I(inode)->root;
3110 : int delete_item = 0;
3111 : int release_rsv = 0;
3112 : int ret = 0;
3113 :
3114 : spin_lock(&root->orphan_lock);
3115 9631 : if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3116 9631 : &BTRFS_I(inode)->runtime_flags))
3117 : delete_item = 1;
3118 :
3119 9631 : if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3120 : &BTRFS_I(inode)->runtime_flags))
3121 : release_rsv = 1;
3122 : spin_unlock(&root->orphan_lock);
3123 :
3124 9631 : if (delete_item) {
3125 9631 : atomic_dec(&root->orphan_inodes);
3126 9631 : if (trans)
3127 9631 : ret = btrfs_del_orphan_item(trans, root,
3128 : btrfs_ino(inode));
3129 : }
3130 :
3131 9631 : if (release_rsv)
3132 9630 : btrfs_orphan_release_metadata(inode);
3133 :
3134 9631 : return ret;
3135 : }
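: /*
:  * Orphan lifecycle in short: btrfs_orphan_add() inserts an orphan item
:  * keyed (BTRFS_ORPHAN_OBJECTID, ORPHAN_ITEM_KEY, inode number) while an
:  * unlink or truncate is in flight, btrfs_orphan_del() removes it once
:  * the operation has fully committed, and anything still present at
:  * mount time is replayed by btrfs_orphan_cleanup() below.
:  */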
3136 :
3137 : /*
3138 : * this cleans up any orphans that may be left on the list from the last use
3139 : * of this root.
3140 : */
3141 785 : int btrfs_orphan_cleanup(struct btrfs_root *root)
3142 : {
3143 : struct btrfs_path *path;
3144 : struct extent_buffer *leaf;
3145 : struct btrfs_key key, found_key;
3146 : struct btrfs_trans_handle *trans;
3147 : struct inode *inode;
3148 : u64 last_objectid = 0;
3149 : int ret = 0, nr_unlink = 0, nr_truncate = 0;
3150 :
3151 785 : if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3152 : return 0;
3153 :
3154 778 : path = btrfs_alloc_path();
3155 778 : if (!path) {
3156 : ret = -ENOMEM;
3157 : goto out;
3158 : }
3159 778 : path->reada = -1;
3160 :
3161 778 : key.objectid = BTRFS_ORPHAN_OBJECTID;
3162 : btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
3163 778 : key.offset = (u64)-1;
3164 :
3165 : while (1) {
3166 851 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3167 851 : if (ret < 0)
3168 : goto out;
3169 :
3170 : /*
3171 : * ret == 0 means we found exactly what we were searching for, which
3172 : * is weird but possible; only adjust the path if we didn't find the
3173 : * key, and see if we have entries that match
3174 : */
3175 851 : if (ret > 0) {
3176 : ret = 0;
3177 794 : if (path->slots[0] == 0)
3178 : break;
3179 794 : path->slots[0]--;
3180 : }
3181 :
3182 : /* pull out the item */
3183 851 : leaf = path->nodes[0];
3184 851 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3185 :
3186 : /* make sure the item matches what we want */
3187 851 : if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3188 : break;
3189 73 : if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
3190 : break;
3191 :
3192 : /* release the path since we're done with it */
3193 73 : btrfs_release_path(path);
3194 :
3195 : /*
3196 : * This is where we basically do btrfs_lookup, without crossing
3197 : * roots. We store the inode number in the offset field of the
3198 : * orphan item.
3199 : */
3200 :
3201 73 : if (found_key.offset == last_objectid) {
3202 0 : btrfs_err(root->fs_info,
3203 : "Error removing orphan entry, stopping orphan cleanup");
3204 : ret = -EINVAL;
3205 0 : goto out;
3206 : }
3207 :
3208 : last_objectid = found_key.offset;
3209 :
3210 73 : found_key.objectid = found_key.offset;
3211 73 : found_key.type = BTRFS_INODE_ITEM_KEY;
3212 73 : found_key.offset = 0;
3213 73 : inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3214 : ret = PTR_ERR_OR_ZERO(inode);
3215 73 : if (ret && ret != -ESTALE)
3216 : goto out;
3217 :
3218 73 : if (ret == -ESTALE && root == root->fs_info->tree_root) {
3219 : struct btrfs_root *dead_root;
3220 : struct btrfs_fs_info *fs_info = root->fs_info;
3221 : int is_dead_root = 0;
3222 :
3223 : /*
3224 : * this is an orphan in the tree root. Currently these
3225 : * could come from 2 sources:
3226 : * a) a snapshot deletion in progress
3227 : * b) a free space cache inode
3228 : * We need to distinguish those two, as the snapshot
3229 : * orphan must not get deleted.
3230 : * find_dead_roots already ran before us, so if this
3231 : * is a snapshot deletion, we should find the root
3232 : * in the dead_roots list
3233 : */
3234 : spin_lock(&fs_info->trans_lock);
3235 386 : list_for_each_entry(dead_root, &fs_info->dead_roots,
3236 : root_list) {
3237 772 : if (dead_root->root_key.objectid ==
3238 386 : found_key.objectid) {
3239 : is_dead_root = 1;
3240 : break;
3241 : }
3242 : }
3243 : spin_unlock(&fs_info->trans_lock);
3244 72 : if (is_dead_root) {
3245 : /* prevent this orphan from being found again */
3246 72 : key.offset = found_key.objectid - 1;
3247 72 : continue;
3248 : }
3249 : }
3250 : /*
3251 : * Inode is already gone but the orphan item is still there,
3252 : * kill the orphan item.
3253 : */
3254 1 : if (ret == -ESTALE) {
3255 0 : trans = btrfs_start_transaction(root, 1);
3256 0 : if (IS_ERR(trans)) {
3257 0 : ret = PTR_ERR(trans);
3258 0 : goto out;
3259 : }
3260 : btrfs_debug(root->fs_info, "auto deleting %Lu",
3261 : found_key.objectid);
3262 0 : ret = btrfs_del_orphan_item(trans, root,
3263 : found_key.objectid);
3264 0 : btrfs_end_transaction(trans, root);
3265 0 : if (ret)
3266 : goto out;
3267 0 : continue;
3268 : }
3269 :
3270 : /*
3271 : * add this inode to the orphan list so btrfs_orphan_del does
3272 : * the proper thing when we hit it
3273 : */
3274 : set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3275 : &BTRFS_I(inode)->runtime_flags);
3276 1 : atomic_inc(&root->orphan_inodes);
3277 :
3278 : /* if we have links, this was a truncate, lets do that */
3279 1 : if (inode->i_nlink) {
3280 0 : if (WARN_ON(!S_ISREG(inode->i_mode))) {
3281 0 : iput(inode);
3282 0 : continue;
3283 : }
3284 : nr_truncate++;
3285 :
3286 : /* 1 for the orphan item deletion. */
3287 0 : trans = btrfs_start_transaction(root, 1);
3288 0 : if (IS_ERR(trans)) {
3289 0 : iput(inode);
3290 0 : ret = PTR_ERR(trans);
3291 0 : goto out;
3292 : }
3293 0 : ret = btrfs_orphan_add(trans, inode);
3294 0 : btrfs_end_transaction(trans, root);
3295 0 : if (ret) {
3296 0 : iput(inode);
3297 0 : goto out;
3298 : }
3299 :
3300 0 : ret = btrfs_truncate(inode);
3301 0 : if (ret)
3302 0 : btrfs_orphan_del(NULL, inode);
3303 : } else {
3304 : nr_unlink++;
3305 : }
3306 :
3307 : /* this will do delete_inode and everything for us */
3308 1 : iput(inode);
3309 1 : if (ret)
3310 : goto out;
3311 : }
3312 : /* release the path since we're done with it */
3313 778 : btrfs_release_path(path);
3314 :
3315 778 : root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3316 :
3317 778 : if (root->orphan_block_rsv)
3318 0 : btrfs_block_rsv_release(root, root->orphan_block_rsv,
3319 : (u64)-1);
3320 :
3321 1556 : if (root->orphan_block_rsv ||
3322 : test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3323 0 : trans = btrfs_join_transaction(root);
3324 0 : if (!IS_ERR(trans))
3325 0 : btrfs_end_transaction(trans, root);
3326 : }
3327 :
3328 : if (nr_unlink)
3329 : btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
3330 : if (nr_truncate)
3331 : btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
3332 :
3333 : out:
3334 778 : if (ret)
3335 0 : btrfs_crit(root->fs_info,
3336 : "could not do orphan cleanup %d", ret);
3337 778 : btrfs_free_path(path);
3338 778 : return ret;
3339 : }
3340 :
3341 : /*
3342 : * very simple check to peek ahead in the leaf looking for xattrs. If we
3343 : * don't find any xattrs, we know there can't be any acls.
3344 : *
3345 : * slot is the slot the inode is in, objectid is the objectid of the inode
3346 : */
3347 4946 : static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3348 : int slot, u64 objectid,
3349 : int *first_xattr_slot)
3350 : {
3351 : u32 nritems = btrfs_header_nritems(leaf);
3352 : struct btrfs_key found_key;
3353 : static u64 xattr_access = 0;
3354 : static u64 xattr_default = 0;
3355 : int scanned = 0;
3356 :
3357 4946 : if (!xattr_access) {
3358 0 : xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3359 : strlen(POSIX_ACL_XATTR_ACCESS));
3360 0 : xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3361 : strlen(POSIX_ACL_XATTR_DEFAULT));
3362 : }
3363 :
3364 4946 : slot++;
3365 4946 : *first_xattr_slot = -1;
3366 10191 : while (slot < nritems) {
3367 4798 : btrfs_item_key_to_cpu(leaf, &found_key, slot);
3368 :
3369 : /* we found a different objectid, there must not be acls */
3370 4798 : if (found_key.objectid != objectid)
3371 : return 0;
3372 :
3373 : /* we found an xattr, assume we've got an acl */
3374 3939 : if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3375 183 : if (*first_xattr_slot == -1)
3376 182 : *first_xattr_slot = slot;
3377 366 : if (found_key.offset == xattr_access ||
3378 183 : found_key.offset == xattr_default)
3379 : return 1;
3380 : }
3381 :
3382 : /*
3383 : * we found a key greater than an xattr key, there can't
3384 : * be any acls later on
3385 : */
3386 3939 : if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3387 : return 0;
3388 :
3389 299 : slot++;
3390 299 : scanned++;
3391 :
3392 : /*
3393 : * it goes inode, inode backrefs, xattrs, extents,
3394 : * so if there are a ton of hard links to an inode there can
3395 : * be a lot of backrefs. Don't waste time searching too hard,
3396 : * this is just an optimization
3397 : */
3398 299 : if (scanned >= 8)
3399 : break;
3400 : }
3401 : /* we hit the end of the leaf before we found an xattr or
3402 : * something larger than an xattr. We have to assume the inode
3403 : * has acls
3404 : */
3405 447 : if (*first_xattr_slot == -1)
3406 441 : *first_xattr_slot = slot;
3407 : return 1;
3408 : }
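: /*
:  * The scan above is a bounded peek-ahead: items for one inode are laid
:  * out in key order as inode item, backrefs, xattrs, extents, so seeing
:  * any key type past BTRFS_XATTR_ITEM_KEY proves there are no xattrs,
:  * while the scanned >= 8 cutoff just caps the cost for inodes with many
:  * hard-link backrefs.  Worked example (illustrative keys): a leaf with
:  *   (257 INODE_ITEM 0) (257 INODE_REF ...) (257 EXTENT_DATA 0)
:  * hits EXTENT_DATA > XATTR_ITEM_KEY on the second peeked slot and
:  * returns 0 without looking at the rest of the leaf.
:  */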
3409 :
3410 : /*
3411 : * read an inode from the btree into the in-memory inode
3412 : */
3413 5018 : static void btrfs_read_locked_inode(struct inode *inode)
3414 : {
3415 : struct btrfs_path *path;
3416 4758 : struct extent_buffer *leaf;
3417 : struct btrfs_inode_item *inode_item;
3418 : struct btrfs_timespec *tspec;
3419 5018 : struct btrfs_root *root = BTRFS_I(inode)->root;
3420 : struct btrfs_key location;
3421 : unsigned long ptr;
3422 : int maybe_acls;
3423 : u32 rdev;
3424 : int ret;
3425 : bool filled = false;
3426 : int first_xattr_slot;
3427 :
3428 5018 : ret = btrfs_fill_inode(inode, &rdev);
3429 5018 : if (!ret)
3430 : filled = true;
3431 :
3432 5018 : path = btrfs_alloc_path();
3433 5018 : if (!path)
3434 : goto make_bad;
3435 :
3436 5018 : memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3437 :
3438 5018 : ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3439 5018 : if (ret)
3440 : goto make_bad;
3441 :
3442 4946 : leaf = path->nodes[0];
3443 :
3444 4946 : if (filled)
3445 : goto cache_index;
3446 :
3447 9892 : inode_item = btrfs_item_ptr(leaf, path->slots[0],
3448 : struct btrfs_inode_item);
3449 4946 : inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3450 4946 : set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3451 : i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3452 : i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3453 : btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3454 :
3455 : tspec = btrfs_inode_atime(inode_item);
3456 4946 : inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3457 4946 : inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3458 :
3459 : tspec = btrfs_inode_mtime(inode_item);
3460 4946 : inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3461 4946 : inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3462 :
3463 : tspec = btrfs_inode_ctime(inode_item);
3464 4946 : inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3465 4946 : inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3466 :
3467 4946 : inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3468 4946 : BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3469 4946 : BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3470 :
3471 : /*
3472 : * If we were modified in the current generation and evicted from memory
3473 : * and then re-read we need to do a full sync since we don't have any
3474 : * idea about which extents were modified before we were evicted from
3475 : * cache.
3476 : */
3477 4946 : if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
3478 : set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3479 : &BTRFS_I(inode)->runtime_flags);
3480 :
3481 4946 : inode->i_version = btrfs_inode_sequence(leaf, inode_item);
3482 4946 : inode->i_generation = BTRFS_I(inode)->generation;
3483 4946 : inode->i_rdev = 0;
3484 4946 : rdev = btrfs_inode_rdev(leaf, inode_item);
3485 :
3486 4946 : BTRFS_I(inode)->index_cnt = (u64)-1;
3487 4946 : BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3488 :
3489 : cache_index:
3490 4946 : path->slots[0]++;
3491 9704 : if (inode->i_nlink != 1 ||
3492 4758 : path->slots[0] >= btrfs_header_nritems(leaf))
3493 : goto cache_acl;
3494 :
3495 4677 : btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3496 9354 : if (location.objectid != btrfs_ino(inode))
3497 : goto cache_acl;
3498 :
3499 8996 : ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3500 4498 : if (location.type == BTRFS_INODE_REF_KEY) {
3501 : struct btrfs_inode_ref *ref;
3502 :
3503 4360 : ref = (struct btrfs_inode_ref *)ptr;
3504 4360 : BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3505 138 : } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3506 : struct btrfs_inode_extref *extref;
3507 :
3508 1 : extref = (struct btrfs_inode_extref *)ptr;
3509 1 : BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3510 : extref);
3511 : }
3512 : cache_acl:
3513 : /*
3514 : * try to precache a NULL acl entry for files that don't have
3515 : * any xattrs or acls
3516 : */
3517 4946 : maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3518 : btrfs_ino(inode), &first_xattr_slot);
3519 4946 : if (first_xattr_slot != -1) {
3520 623 : path->slots[0] = first_xattr_slot;
3521 623 : ret = btrfs_load_inode_props(inode, path);
3522 623 : if (ret)
3523 0 : btrfs_err(root->fs_info,
3524 : "error loading props for ino %llu (root %llu): %d",
3525 : btrfs_ino(inode),
3526 : root->root_key.objectid, ret);
3527 : }
3528 4946 : btrfs_free_path(path);
3529 :
3530 4946 : if (!maybe_acls)
3531 : cache_no_acl(inode);
3532 :
3533 4946 : switch (inode->i_mode & S_IFMT) {
3534 : case S_IFREG:
3535 3228 : inode->i_mapping->a_ops = &btrfs_aops;
3536 3228 : inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3537 3228 : BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3538 3228 : inode->i_fop = &btrfs_file_operations;
3539 3228 : inode->i_op = &btrfs_file_inode_operations;
3540 3228 : break;
3541 : case S_IFDIR:
3542 1213 : inode->i_fop = &btrfs_dir_file_operations;
3543 1213 : if (root == root->fs_info->tree_root)
3544 0 : inode->i_op = &btrfs_dir_ro_inode_operations;
3545 : else
3546 1213 : inode->i_op = &btrfs_dir_inode_operations;
3547 : break;
3548 : case S_IFLNK:
3549 267 : inode->i_op = &btrfs_symlink_inode_operations;
3550 267 : inode->i_mapping->a_ops = &btrfs_symlink_aops;
3551 267 : inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3552 267 : break;
3553 : default:
3554 238 : inode->i_op = &btrfs_special_inode_operations;
3555 238 : init_special_inode(inode, inode->i_mode, rdev);
3556 238 : break;
3557 : }
3558 :
3559 4946 : btrfs_update_iflags(inode);
3560 9964 : return;
3561 :
3562 : make_bad:
3563 72 : btrfs_free_path(path);
3564 72 : make_bad_inode(inode);
3565 : }
3566 :
3567 : /*
3568 : * given a leaf and an inode, copy the inode fields into the leaf
3569 : */
3570 42409 : static void fill_inode_item(struct btrfs_trans_handle *trans,
3571 : struct extent_buffer *leaf,
3572 : struct btrfs_inode_item *item,
3573 : struct inode *inode)
3574 : {
3575 : struct btrfs_map_token token;
3576 :
3577 : btrfs_init_map_token(&token);
3578 :
3579 : btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3580 : btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3581 42410 : btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3582 : &token);
3583 42410 : btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3584 42410 : btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3585 :
3586 42410 : btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3587 42410 : inode->i_atime.tv_sec, &token);
3588 42410 : btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3589 42410 : inode->i_atime.tv_nsec, &token);
3590 :
3591 42410 : btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3592 42410 : inode->i_mtime.tv_sec, &token);
3593 42410 : btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3594 42410 : inode->i_mtime.tv_nsec, &token);
3595 :
3596 42410 : btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3597 42410 : inode->i_ctime.tv_sec, &token);
3598 42410 : btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3599 42410 : inode->i_ctime.tv_nsec, &token);
3600 :
3601 42410 : btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3602 : &token);
3603 42410 : btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3604 : &token);
3605 42410 : btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3606 42410 : btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3607 42410 : btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3608 42410 : btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3609 : btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3610 42410 : }
3611 :
3612 : /*
3613 : * copy everything in the in-memory inode directly into the btree.
3614 : */
3615 43876 : static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3616 : struct btrfs_root *root, struct inode *inode)
3617 : {
3618 : struct btrfs_inode_item *inode_item;
3619 : struct btrfs_path *path;
3620 : struct extent_buffer *leaf;
3621 : int ret;
3622 :
3623 21938 : path = btrfs_alloc_path();
3624 21938 : if (!path)
3625 : return -ENOMEM;
3626 :
3627 21938 : path->leave_spinning = 1;
3628 21938 : ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3629 : 1);
3630 21938 : if (ret) {
3631 0 : if (ret > 0)
3632 : ret = -ENOENT;
3633 : goto failed;
3634 : }
3635 :
3636 21938 : leaf = path->nodes[0];
3637 43876 : inode_item = btrfs_item_ptr(leaf, path->slots[0],
3638 : struct btrfs_inode_item);
3639 :
3640 21938 : fill_inode_item(trans, leaf, inode_item, inode);
3641 21938 : btrfs_mark_buffer_dirty(leaf);
3642 : btrfs_set_inode_last_trans(trans, inode);
3643 : ret = 0;
3644 : failed:
3645 21938 : btrfs_free_path(path);
3646 21937 : return ret;
3647 : }
3648 :
3649 : /*
3650 : * copy everything in the in-memory inode into the btree.
3651 : */
3652 322344 : noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3653 : struct btrfs_root *root, struct inode *inode)
3654 : {
3655 : int ret;
3656 :
3657 : /*
3658 : * If the inode is a free space inode, we can deadlock during commit
3659 : * if we put it into the delayed code.
3660 : *
3661 : * The data relocation inode should also be directly updated
3662 : * without delay
3663 : */
3664 172140 : if (!btrfs_is_free_space_inode(inode)
3665 152244 : && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
3666 150202 : btrfs_update_root_times(trans, root);
3667 :
3668 150203 : ret = btrfs_delayed_update_inode(trans, root, inode);
3669 150204 : if (!ret)
3670 : btrfs_set_inode_last_trans(trans, inode);
3671 150204 : return ret;
3672 : }
3673 :
3674 21938 : return btrfs_update_inode_item(trans, root, inode);
3675 : }
3676 :
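      : /*
      : * As btrfs_update_inode(), but fall back to updating the inode item
      : * in the btree directly if the delayed update fails with -ENOSPC.
      : */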
3677 51685 : noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3678 : struct btrfs_root *root,
3679 : struct inode *inode)
3680 : {
3681 : int ret;
3682 :
3683 51685 : ret = btrfs_update_inode(trans, root, inode);
3684 51684 : if (ret == -ENOSPC)
3685 0 : return btrfs_update_inode_item(trans, root, inode);
3686 : return ret;
3687 : }
3688 :
3689 : /*
3690 : * unlink helper that gets used here in inode.c and in the tree logging
3691 : * recovery code. It removes a link in a directory with a given name, and
3692 : * also drops the inode's back refs to the directory
3693 : */
3694 12289 : static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3695 : struct btrfs_root *root,
3696 : struct inode *dir, struct inode *inode,
3697 : const char *name, int name_len)
3698 : {
3699 : struct btrfs_path *path;
3700 : int ret = 0;
3701 : struct extent_buffer *leaf;
3702 : struct btrfs_dir_item *di;
3703 : struct btrfs_key key;
3704 : u64 index;
3705 : u64 ino = btrfs_ino(inode);
3706 : u64 dir_ino = btrfs_ino(dir);
3707 :
3708 12289 : path = btrfs_alloc_path();
3709 12289 : if (!path) {
3710 : ret = -ENOMEM;
3711 : goto out;
3712 : }
3713 :
3714 12289 : path->leave_spinning = 1;
3715 12289 : di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3716 : name, name_len, -1);
3717 12289 : if (IS_ERR(di)) {
3718 0 : ret = PTR_ERR(di);
3719 0 : goto err;
3720 : }
3721 12289 : if (!di) {
3722 : ret = -ENOENT;
3723 : goto err;
3724 : }
3725 12289 : leaf = path->nodes[0];
3726 12289 : btrfs_dir_item_key_to_cpu(leaf, di, &key);
3727 12289 : ret = btrfs_delete_one_dir_name(trans, root, path, di);
3728 12289 : if (ret)
3729 : goto err;
3730 12289 : btrfs_release_path(path);
3731 :
3732 : /*
3733 : * If we don't have the dir index, we have to get it by looking up
3734 : * the inode ref; and since that lookup gives us the inode ref, we
3735 : * remove it directly, so there is no need for delayed deletion.
3736 : *
3737 : * But if we do have the dir index, there is no need to search for
3738 : * the inode ref to get it. Since the inode ref is close to the
3739 : * inode item, it is better to delay its deletion and do it when we
3740 : * update the inode item.
3741 : */
3742 12289 : if (BTRFS_I(inode)->dir_index) {
3743 6256 : ret = btrfs_delayed_delete_inode_ref(inode);
3744 6256 : if (!ret) {
3745 6256 : index = BTRFS_I(inode)->dir_index;
3746 6256 : goto skip_backref;
3747 : }
3748 : }
3749 :
3750 6033 : ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
3751 : dir_ino, &index);
3752 6033 : if (ret) {
3753 0 : btrfs_info(root->fs_info,
3754 : "failed to delete reference to %.*s, inode %llu parent %llu",
3755 : name_len, name, ino, dir_ino);
3756 0 : btrfs_abort_transaction(trans, root, ret);
3757 0 : goto err;
3758 : }
3759 : skip_backref:
3760 12289 : ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3761 12289 : if (ret) {
3762 0 : btrfs_abort_transaction(trans, root, ret);
3763 0 : goto err;
3764 : }
3765 :
3766 12289 : ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
3767 : inode, dir_ino);
3768 12289 : if (ret != 0 && ret != -ENOENT) {
3769 0 : btrfs_abort_transaction(trans, root, ret);
3770 0 : goto err;
3771 : }
3772 :
3773 12289 : ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
3774 : dir, index);
3775 12289 : if (ret == -ENOENT)
3776 : ret = 0;
3777 12138 : else if (ret)
3778 0 : btrfs_abort_transaction(trans, root, ret);
3779 : err:
3780 12289 : btrfs_free_path(path);
3781 12289 : if (ret)
3782 : goto out;
3783 :
3784 12289 : btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3785 : inode_inc_iversion(inode);
3786 : inode_inc_iversion(dir);
3787 12289 : inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3788 12289 : ret = btrfs_update_inode(trans, root, dir);
3789 : out:
3790 12289 : return ret;
3791 : }
3792 :
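      : /*
      : * remove a directory entry and drop the link count on the inode,
      : * updating the inode item if the unlink succeeds
      : */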
3793 9978 : int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3794 : struct btrfs_root *root,
3795 : struct inode *dir, struct inode *inode,
3796 : const char *name, int name_len)
3797 : {
3798 : int ret;
3799 9978 : ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
3800 9978 : if (!ret) {
3801 9978 : drop_nlink(inode);
3802 9978 : ret = btrfs_update_inode(trans, root, inode);
3803 : }
3804 9978 : return ret;
3805 : }
3806 :
3807 : /*
3808 : * helper to start transaction for unlink and rmdir.
3809 : *
3810 : * unlink and rmdir are special in btrfs: they do not always free space, so
3811 : * if we cannot make our reservations the normal way, try to see if there is
3812 : * plenty of slack room in the global reserve to migrate, otherwise we cannot
3813 : * allow the unlink to occur.
3814 : */
3815 9966 : static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
3816 : {
3817 : struct btrfs_trans_handle *trans;
3818 9966 : struct btrfs_root *root = BTRFS_I(dir)->root;
3819 : int ret;
3820 :
3821 : /*
3822 : * 1 for the possible orphan item
3823 : * 1 for the dir item
3824 : * 1 for the dir index
3825 : * 1 for the inode ref
3826 : * 1 for the inode
3827 : */
3828 9966 : trans = btrfs_start_transaction(root, 5);
3829 9966 : if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3830 : return trans;
3831 :
3832 0 : if (PTR_ERR(trans) == -ENOSPC) {
3833 : u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3834 :
3835 0 : trans = btrfs_start_transaction(root, 0);
3836 0 : if (IS_ERR(trans))
3837 : return trans;
3838 0 : ret = btrfs_cond_migrate_bytes(root->fs_info,
3839 0 : &root->fs_info->trans_block_rsv,
3840 : num_bytes, 5);
3841 0 : if (ret) {
3842 0 : btrfs_end_transaction(trans, root);
3843 0 : return ERR_PTR(ret);
3844 : }
3845 0 : trans->block_rsv = &root->fs_info->trans_block_rsv;
3846 0 : trans->bytes_reserved = num_bytes;
3847 : }
3848 0 : return trans;
3849 : }
3850 :
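      : /*
      : * the ->unlink callback: remove the directory entry and, if the link
      : * count hit zero, add an orphan item so the inode still gets cleaned
      : * up if we crash before eviction finishes the delete
      : */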
3851 8623 : static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3852 : {
3853 8623 : struct btrfs_root *root = BTRFS_I(dir)->root;
3854 : struct btrfs_trans_handle *trans;
3855 8623 : struct inode *inode = dentry->d_inode;
3856 : int ret;
3857 :
3858 8623 : trans = __unlink_start_trans(dir);
3859 8623 : if (IS_ERR(trans))
3860 0 : return PTR_ERR(trans);
3861 :
3862 8623 : btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
3863 :
3864 17246 : ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3865 17246 : dentry->d_name.name, dentry->d_name.len);
3866 8623 : if (ret)
3867 : goto out;
3868 :
3869 8623 : if (inode->i_nlink == 0) {
3870 5438 : ret = btrfs_orphan_add(trans, inode);
3871 : if (ret)
3872 : goto out;
3873 : }
3874 :
3875 : out:
3876 8623 : btrfs_end_transaction(trans, root);
3877 8623 : btrfs_btree_balance_dirty(root);
3878 8623 : return ret;
3879 : }
3880 :
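      : /*
      : * remove a subvolume entry from a directory: delete the dir item,
      : * the root ref and the dir index item, then update the parent
      : * directory's size and timestamps
      : */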
3881 34 : int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3882 : struct btrfs_root *root,
3883 : struct inode *dir, u64 objectid,
3884 : const char *name, int name_len)
3885 : {
3886 : struct btrfs_path *path;
3887 : struct extent_buffer *leaf;
3888 : struct btrfs_dir_item *di;
3889 : struct btrfs_key key;
3890 : u64 index;
3891 : int ret;
3892 : u64 dir_ino = btrfs_ino(dir);
3893 :
3894 34 : path = btrfs_alloc_path();
3895 34 : if (!path)
3896 : return -ENOMEM;
3897 :
3898 34 : di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3899 : name, name_len, -1);
3900 34 : if (IS_ERR_OR_NULL(di)) {
3901 0 : if (!di)
3902 : ret = -ENOENT;
3903 : else
3904 0 : ret = PTR_ERR(di);
3905 : goto out;
3906 : }
3907 :
3908 34 : leaf = path->nodes[0];
3909 34 : btrfs_dir_item_key_to_cpu(leaf, di, &key);
3910 34 : WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
3911 34 : ret = btrfs_delete_one_dir_name(trans, root, path, di);
3912 34 : if (ret) {
3913 0 : btrfs_abort_transaction(trans, root, ret);
3914 0 : goto out;
3915 : }
3916 34 : btrfs_release_path(path);
3917 :
3918 34 : ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
3919 : objectid, root->root_key.objectid,
3920 : dir_ino, &index, name, name_len);
3921 34 : if (ret < 0) {
3922 0 : if (ret != -ENOENT) {
3923 0 : btrfs_abort_transaction(trans, root, ret);
3924 0 : goto out;
3925 : }
3926 0 : di = btrfs_search_dir_index_item(root, path, dir_ino,
3927 : name, name_len);
3928 0 : if (IS_ERR_OR_NULL(di)) {
3929 0 : if (!di)
3930 : ret = -ENOENT;
3931 : else
3932 0 : ret = PTR_ERR(di);
3933 0 : btrfs_abort_transaction(trans, root, ret);
3934 0 : goto out;
3935 : }
3936 :
3937 0 : leaf = path->nodes[0];
3938 0 : btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3939 0 : btrfs_release_path(path);
3940 0 : index = key.offset;
3941 : }
3942 34 : btrfs_release_path(path);
3943 :
3944 34 : ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3945 34 : if (ret) {
3946 0 : btrfs_abort_transaction(trans, root, ret);
3947 0 : goto out;
3948 : }
3949 :
3950 34 : btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3951 : inode_inc_iversion(dir);
3952 34 : dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3953 34 : ret = btrfs_update_inode_fallback(trans, root, dir);
3954 34 : if (ret)
3955 0 : btrfs_abort_transaction(trans, root, ret);
3956 : out:
3957 34 : btrfs_free_path(path);
3958 34 : return ret;
3959 : }
3960 :
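      : /*
      : * the ->rmdir callback: only empty directories can be removed, a
      : * subvolume root itself cannot be rmdir'd, and empty-subvolume
      : * placeholder directories are handled via btrfs_unlink_subvol()
      : */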
3961 1911 : static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3962 : {
3963 1911 : struct inode *inode = dentry->d_inode;
3964 : int err = 0;
3965 1911 : struct btrfs_root *root = BTRFS_I(dir)->root;
3966 : struct btrfs_trans_handle *trans;
3967 :
3968 1911 : if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3969 : return -ENOTEMPTY;
3970 1344 : if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3971 : return -EPERM;
3972 :
3973 1343 : trans = __unlink_start_trans(dir);
3974 1343 : if (IS_ERR(trans))
3975 0 : return PTR_ERR(trans);
3976 :
3977 1343 : if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
3978 0 : err = btrfs_unlink_subvol(trans, root, dir,
3979 : BTRFS_I(inode)->location.objectid,
3980 0 : dentry->d_name.name,
3981 0 : dentry->d_name.len);
3982 0 : goto out;
3983 : }
3984 :
3985 1343 : err = btrfs_orphan_add(trans, inode);
3986 1343 : if (err)
3987 : goto out;
3988 :
3989 : /* now the directory is empty */
3990 2686 : err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3991 2686 : dentry->d_name.name, dentry->d_name.len);
3992 1343 : if (!err)
3993 : btrfs_i_size_write(inode, 0);
3994 : out:
3995 1343 : btrfs_end_transaction(trans, root);
3996 1343 : btrfs_btree_balance_dirty(root);
3997 :
3998 1343 : return err;
3999 : }
4000 :
4001 : /*
4002 : * this can truncate away extent items, csum items and directory items.
4003 : * It starts at a high offset and removes keys until it can't find
4004 : * any higher than new_size
4005 : *
4006 : * csum items that cross the new i_size are truncated to the new size
4007 : * as well.
4008 : *
4009 : * min_type is the minimum key type to truncate down to. If set to 0, this
4010 : * will kill all the items on this inode, including the INODE_ITEM_KEY.
4011 : */
4012 13759 : int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4013 : struct btrfs_root *root,
4014 : struct inode *inode,
4015 : u64 new_size, u32 min_type)
4016 : {
4017 : struct btrfs_path *path;
4018 12914 : struct extent_buffer *leaf;
4019 : struct btrfs_file_extent_item *fi;
4020 : struct btrfs_key key;
4021 : struct btrfs_key found_key;
4022 : u64 extent_start = 0;
4023 : u64 extent_num_bytes = 0;
4024 : u64 extent_offset = 0;
4025 : u64 item_end = 0;
4026 : u64 last_size = (u64)-1;
4027 : u32 found_type = (u8)-1;
4028 : int found_extent;
4029 : int del_item;
4030 : int pending_del_nr = 0;
4031 : int pending_del_slot = 0;
4032 : int extent_type = -1;
4033 : int ret;
4034 : int err = 0;
4035 : u64 ino = btrfs_ino(inode);
4036 :
4037 13759 : BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4038 :
4039 13759 : path = btrfs_alloc_path();
4040 13759 : if (!path)
4041 : return -ENOMEM;
4042 13759 : path->reada = -1;
4043 :
4044 : /*
4045 : * We want to drop from the next block forward in case this new size is
4046 : * not block aligned since we will be keeping the last block of the
4047 : * extent just the way it is.
4048 : */
4049 17929 : if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4050 4170 : root == root->fs_info->tree_root)
4051 13431 : btrfs_drop_extent_cache(inode, ALIGN(new_size,
4052 : root->sectorsize), (u64)-1, 0);
4053 :
4054 : /*
4055 : * This function is also used to drop the items in the log tree before
4056 : * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4057 : * it is used to drop the logged items. So we shouldn't kill the delayed
4058 : * items.
4059 : */
4060 13759 : if (min_type == 0 && root == BTRFS_I(inode)->root)
4061 6907 : btrfs_kill_delayed_inode_items(inode);
4062 :
4063 13759 : key.objectid = ino;
4064 13759 : key.offset = (u64)-1;
4065 13759 : key.type = (u8)-1;
4066 :
4067 : search_again:
4068 14406 : path->leave_spinning = 1;
4069 14406 : ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4070 14406 : if (ret < 0) {
4071 : err = ret;
4072 : goto out;
4073 : }
4074 :
4075 14406 : if (ret > 0) {
4076 : /* there are no items in the tree for us to truncate, we're
4077 : * done
4078 : */
4079 14406 : if (path->slots[0] == 0)
4080 : goto out;
4081 14189 : path->slots[0]--;
4082 : }
4083 :
4084 : while (1) {
4085 : fi = NULL;
4086 38229 : leaf = path->nodes[0];
4087 38229 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4088 38229 : found_type = btrfs_key_type(&found_key);
4089 :
4090 38229 : if (found_key.objectid != ino)
4091 : break;
4092 :
4093 38123 : if (found_type < min_type)
4094 : break;
4095 :
4096 33833 : item_end = found_key.offset;
4097 33833 : if (found_type == BTRFS_EXTENT_DATA_KEY) {
4098 41164 : fi = btrfs_item_ptr(leaf, path->slots[0],
4099 : struct btrfs_file_extent_item);
4100 20582 : extent_type = btrfs_file_extent_type(leaf, fi);
4101 20582 : if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4102 19228 : item_end +=
4103 : btrfs_file_extent_num_bytes(leaf, fi);
4104 1354 : } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4105 1354 : item_end += btrfs_file_extent_inline_len(leaf,
4106 : path->slots[0], fi);
4107 : }
4108 20582 : item_end--;
4109 : }
4110 33833 : if (found_type > min_type) {
4111 : del_item = 1;
4112 : } else {
4113 9815 : if (item_end < new_size)
4114 : break;
4115 8177 : if (found_key.offset >= new_size)
4116 : del_item = 1;
4117 : else
4118 : del_item = 0;
4119 : }
4120 : found_extent = 0;
4121 : /* FIXME, shrink the extent if the ref count is only 1 */
4122 32195 : if (found_type != BTRFS_EXTENT_DATA_KEY)
4123 : goto delete;
4124 :
4125 18944 : if (del_item)
4126 18348 : last_size = found_key.offset;
4127 : else
4128 : last_size = new_size;
4129 :
4130 18944 : if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4131 : u64 num_dec;
4132 : extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4133 17608 : if (!del_item) {
4134 : u64 orig_num_bytes =
4135 : btrfs_file_extent_num_bytes(leaf, fi);
4136 596 : extent_num_bytes = ALIGN(new_size -
4137 : found_key.offset,
4138 : root->sectorsize);
4139 : btrfs_set_file_extent_num_bytes(leaf, fi,
4140 : extent_num_bytes);
4141 596 : num_dec = (orig_num_bytes -
4142 : extent_num_bytes);
4143 596 : if (test_bit(BTRFS_ROOT_REF_COWS,
4144 596 : &root->state) &&
4145 : extent_start != 0)
4146 331 : inode_sub_bytes(inode, num_dec);
4147 596 : btrfs_mark_buffer_dirty(leaf);
4148 : } else {
4149 : extent_num_bytes =
4150 : btrfs_file_extent_disk_num_bytes(leaf,
4151 : fi);
4152 34024 : extent_offset = found_key.offset -
4153 : btrfs_file_extent_offset(leaf, fi);
4154 :
4155 : /* FIXME blocksize != 4096 */
4156 : num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4157 17012 : if (extent_start != 0) {
4158 : found_extent = 1;
4159 12923 : if (test_bit(BTRFS_ROOT_REF_COWS,
4160 : &root->state))
4161 9072 : inode_sub_bytes(inode, num_dec);
4162 : }
4163 : }
4164 1336 : } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4165 : /*
4166 : * we can't truncate inline items that have had
4167 : * special encodings
4168 : */
4169 1336 : if (!del_item &&
4170 0 : btrfs_file_extent_compression(leaf, fi) == 0 &&
4171 0 : btrfs_file_extent_encryption(leaf, fi) == 0 &&
4172 0 : btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4173 0 : u32 size = new_size - found_key.offset;
4174 :
4175 0 : if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4176 0 : inode_sub_bytes(inode, item_end + 1 -
4177 : new_size);
4178 :
4179 : /*
4180 : * update the ram bytes to properly reflect
4181 : * the new size of our item
4182 : */
4183 0 : btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4184 : size =
4185 : btrfs_file_extent_calc_inline_size(size);
4186 0 : btrfs_truncate_item(root, path, size, 1);
4187 1336 : } else if (test_bit(BTRFS_ROOT_REF_COWS,
4188 : &root->state)) {
4189 1336 : inode_sub_bytes(inode, item_end + 1 -
4190 1336 : found_key.offset);
4191 : }
4192 : }
4193 : delete:
4194 32195 : if (del_item) {
4195 31599 : if (!pending_del_nr) {
4196 : /* no pending yet, add ourselves */
4197 11847 : pending_del_slot = path->slots[0];
4198 : pending_del_nr = 1;
4199 39504 : } else if (pending_del_nr &&
4200 19752 : path->slots[0] + 1 == pending_del_slot) {
4201 : /* hop on the pending chunk */
4202 19752 : pending_del_nr++;
4203 19752 : pending_del_slot = path->slots[0];
4204 : } else {
4205 0 : BUG();
4206 : }
4207 : } else {
4208 : break;
4209 : }
4210 44522 : if (found_extent &&
4211 3851 : (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4212 3851 : root == root->fs_info->tree_root)) {
4213 12914 : btrfs_set_path_blocking(path);
4214 12914 : ret = btrfs_free_extent(trans, root, extent_start,
4215 : extent_num_bytes, 0,
4216 : btrfs_header_owner(leaf),
4217 : ino, extent_offset, 0);
4218 12914 : BUG_ON(ret);
4219 : }
4220 :
4221 31599 : if (found_type == BTRFS_INODE_ITEM_KEY)
4222 : break;
4223 :
4224 24687 : if (path->slots[0] == 0 ||
4225 : path->slots[0] != pending_del_slot) {
4226 647 : if (pending_del_nr) {
4227 647 : ret = btrfs_del_items(trans, root, path,
4228 : pending_del_slot,
4229 : pending_del_nr);
4230 647 : if (ret) {
4231 0 : btrfs_abort_transaction(trans,
4232 : root, ret);
4233 0 : goto error;
4234 : }
4235 : pending_del_nr = 0;
4236 : }
4237 647 : btrfs_release_path(path);
4238 647 : goto search_again;
4239 : } else {
4240 24040 : path->slots[0]--;
4241 : }
4242 24040 : }
4243 : out:
4244 13759 : if (pending_del_nr) {
4245 11200 : ret = btrfs_del_items(trans, root, path, pending_del_slot,
4246 : pending_del_nr);
4247 11200 : if (ret)
4248 0 : btrfs_abort_transaction(trans, root, ret);
4249 : }
4250 : error:
4251 21563 : if (last_size != (u64)-1 &&
4252 7804 : root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4253 7801 : btrfs_ordered_update_i_size(inode, last_size, NULL);
4254 13759 : btrfs_free_path(path);
4255 13759 : return err;
4256 : }
4257 :
4258 : /*
4259 : * btrfs_truncate_page - read, zero a chunk and write a page
4260 : * @inode - inode that we're zeroing
4261 : * @from - the offset to start zeroing
4262 : * @len - the length to zero, 0 to zero the entire range relative to the
4263 : * offset
4264 : * @front - zero up to the offset instead of from the offset on
4265 : *
4266 : * This will find the page for the "from" offset, cow the page and zero the
4267 : * part we want to zero. This is used with truncate and hole punching.
4268 : */
4269 6432 : int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4270 : int front)
4271 : {
4272 6432 : struct address_space *mapping = inode->i_mapping;
4273 6432 : struct btrfs_root *root = BTRFS_I(inode)->root;
4274 6432 : struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4275 : struct btrfs_ordered_extent *ordered;
4276 6432 : struct extent_state *cached_state = NULL;
4277 : char *kaddr;
4278 6432 : u32 blocksize = root->sectorsize;
4279 6432 : pgoff_t index = from >> PAGE_CACHE_SHIFT;
4280 6432 : unsigned offset = from & (PAGE_CACHE_SIZE-1);
4281 2356 : struct page *page;
4282 : gfp_t mask = btrfs_alloc_write_mask(mapping);
4283 : int ret = 0;
4284 : u64 page_start;
4285 : u64 page_end;
4286 :
4287 6432 : if ((offset & (blocksize - 1)) == 0 &&
4288 0 : (!len || ((len & (blocksize - 1)) == 0)))
4289 : goto out;
4290 2351 : ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
4291 2351 : if (ret)
4292 : goto out;
4293 :
4294 : again:
4295 : page = find_or_create_page(mapping, index, mask);
4296 2356 : if (!page) {
4297 0 : btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4298 : ret = -ENOMEM;
4299 0 : goto out;
4300 : }
4301 :
4302 2356 : page_start = page_offset(page);
4303 2356 : page_end = page_start + PAGE_CACHE_SIZE - 1;
4304 :
4305 2356 : if (!PageUptodate(page)) {
4306 : ret = btrfs_readpage(NULL, page);
4307 887 : lock_page(page);
4308 887 : if (page->mapping != mapping) {
4309 0 : unlock_page(page);
4310 0 : page_cache_release(page);
4311 0 : goto again;
4312 : }
4313 887 : if (!PageUptodate(page)) {
4314 : ret = -EIO;
4315 : goto out_unlock;
4316 : }
4317 : }
4318 2356 : wait_on_page_writeback(page);
4319 :
4320 2356 : lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
4321 2356 : set_page_extent_mapped(page);
4322 :
4323 2356 : ordered = btrfs_lookup_ordered_extent(inode, page_start);
4324 2356 : if (ordered) {
4325 5 : unlock_extent_cached(io_tree, page_start, page_end,
4326 : &cached_state, GFP_NOFS);
4327 5 : unlock_page(page);
4328 5 : page_cache_release(page);
4329 5 : btrfs_start_ordered_extent(inode, ordered, 1);
4330 5 : btrfs_put_ordered_extent(ordered);
4331 5 : goto again;
4332 : }
4333 :
4334 2351 : clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
4335 : EXTENT_DIRTY | EXTENT_DELALLOC |
4336 : EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4337 : 0, 0, &cached_state, GFP_NOFS);
4338 :
4339 2351 : ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
4340 : &cached_state);
4341 2351 : if (ret) {
4342 0 : unlock_extent_cached(io_tree, page_start, page_end,
4343 : &cached_state, GFP_NOFS);
4344 0 : goto out_unlock;
4345 : }
4346 :
4347 : if (offset != PAGE_CACHE_SIZE) {
4348 2351 : if (!len)
4349 2351 : len = PAGE_CACHE_SIZE - offset;
4350 : kaddr = kmap(page);
4351 2351 : if (front)
4352 18 : memset(kaddr, 0, offset);
4353 : else
4354 2333 : memset(kaddr + offset, 0, len);
4355 : flush_dcache_page(page);
4356 : kunmap(page);
4357 : }
4358 : ClearPageChecked(page);
4359 2351 : set_page_dirty(page);
4360 2351 : unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
4361 : GFP_NOFS);
4362 :
4363 : out_unlock:
4364 2351 : if (ret)
4365 0 : btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4366 2351 : unlock_page(page);
4367 2351 : page_cache_release(page);
4368 : out:
4369 6432 : return ret;
4370 : }
4371 :
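      : /*
      : * insert a file extent item describing a hole at [offset, offset + len).
      : * With the NO_HOLES incompat feature there is nothing to insert, so just
      : * mark the inode as changed in the current transaction.
      : */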
4372 5031 : static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4373 : u64 offset, u64 len)
4374 : {
4375 : struct btrfs_trans_handle *trans;
4376 : int ret;
4377 :
4378 : /*
4379 : * Still need to make sure the inode looks like it's been updated so
4380 : * that any holes get logged if we fsync.
4381 : */
4382 10062 : if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
4383 3 : BTRFS_I(inode)->last_trans = root->fs_info->generation;
4384 3 : BTRFS_I(inode)->last_sub_trans = root->log_transid;
4385 3 : BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4386 3 : return 0;
4387 : }
4388 :
4389 : /*
4390 : * 1 - for the one we're dropping
4391 : * 1 - for the one we're adding
4392 : * 1 - for updating the inode.
4393 : */
4394 5028 : trans = btrfs_start_transaction(root, 3);
4395 5028 : if (IS_ERR(trans))
4396 0 : return PTR_ERR(trans);
4397 :
4398 5028 : ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4399 5027 : if (ret) {
4400 0 : btrfs_abort_transaction(trans, root, ret);
4401 0 : btrfs_end_transaction(trans, root);
4402 0 : return ret;
4403 : }
4404 :
4405 5027 : ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
4406 : 0, 0, len, 0, len, 0, 0, 0);
4407 5028 : if (ret)
4408 0 : btrfs_abort_transaction(trans, root, ret);
4409 : else
4410 5028 : btrfs_update_inode(trans, root, inode);
4411 5028 : btrfs_end_transaction(trans, root);
4412 5028 : return ret;
4413 : }
4414 :
4415 : /*
4416 : * This function puts in dummy file extents for the area we're creating a hole
4417 : * for. So if we are truncating this file to a larger size we need to insert
4418 : * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
4419 : * for the range between oldsize and size.
4420 : */
4421 4955 : int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4422 : {
4423 4955 : struct btrfs_root *root = BTRFS_I(inode)->root;
4424 4955 : struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4425 5508 : struct extent_map *em = NULL;
4426 4955 : struct extent_state *cached_state = NULL;
4427 4955 : struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4428 4955 : u64 hole_start = ALIGN(oldsize, root->sectorsize);
4429 4955 : u64 block_end = ALIGN(size, root->sectorsize);
4430 : u64 last_byte;
4431 : u64 cur_offset;
4432 : u64 hole_size;
4433 : int err = 0;
4434 :
4435 : /*
4436 : * If our size started in the middle of a page we need to zero out the
4437 : * rest of the page before we expand the i_size, otherwise we could
4438 : * expose stale data.
4439 : */
4440 4955 : err = btrfs_truncate_page(inode, oldsize, 0, 0);
4441 4955 : if (err)
4442 : return err;
4443 :
4444 4955 : if (size <= hole_start)
4445 : return 0;
4446 :
4447 : while (1) {
4448 : struct btrfs_ordered_extent *ordered;
4449 :
4450 4954 : lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
4451 : &cached_state);
4452 4954 : ordered = btrfs_lookup_ordered_range(inode, hole_start,
4453 : block_end - hole_start);
4454 4954 : if (!ordered)
4455 : break;
4456 0 : unlock_extent_cached(io_tree, hole_start, block_end - 1,
4457 : &cached_state, GFP_NOFS);
4458 0 : btrfs_start_ordered_extent(inode, ordered, 1);
4459 0 : btrfs_put_ordered_extent(ordered);
4460 0 : }
4461 :
4462 : cur_offset = hole_start;
4463 : while (1) {
4464 5508 : em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4465 : block_end - cur_offset, 0);
4466 5508 : if (IS_ERR(em)) {
4467 0 : err = PTR_ERR(em);
4468 : em = NULL;
4469 0 : break;
4470 : }
4471 5508 : last_byte = min(extent_map_end(em), block_end);
4472 5508 : last_byte = ALIGN(last_byte , root->sectorsize);
4473 5508 : if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4474 : struct extent_map *hole_em;
4475 5031 : hole_size = last_byte - cur_offset;
4476 :
4477 5031 : err = maybe_insert_hole(root, inode, cur_offset,
4478 : hole_size);
4479 5031 : if (err)
4480 : break;
4481 5031 : btrfs_drop_extent_cache(inode, cur_offset,
4482 : cur_offset + hole_size - 1, 0);
4483 5031 : hole_em = alloc_extent_map();
4484 5031 : if (!hole_em) {
4485 : set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4486 : &BTRFS_I(inode)->runtime_flags);
4487 : goto next;
4488 : }
4489 5031 : hole_em->start = cur_offset;
4490 5031 : hole_em->len = hole_size;
4491 5031 : hole_em->orig_start = cur_offset;
4492 :
4493 5031 : hole_em->block_start = EXTENT_MAP_HOLE;
4494 5031 : hole_em->block_len = 0;
4495 5031 : hole_em->orig_block_len = 0;
4496 5031 : hole_em->ram_bytes = hole_size;
4497 5031 : hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4498 5031 : hole_em->compress_type = BTRFS_COMPRESS_NONE;
4499 5031 : hole_em->generation = root->fs_info->generation;
4500 :
4501 : while (1) {
4502 5031 : write_lock(&em_tree->lock);
4503 5031 : err = add_extent_mapping(em_tree, hole_em, 1);
4504 : write_unlock(&em_tree->lock);
4505 5031 : if (err != -EEXIST)
4506 : break;
4507 0 : btrfs_drop_extent_cache(inode, cur_offset,
4508 : cur_offset +
4509 : hole_size - 1, 0);
4510 0 : }
4511 5031 : free_extent_map(hole_em);
4512 : }
4513 : next:
4514 5508 : free_extent_map(em);
4515 : em = NULL;
4516 : cur_offset = last_byte;
4517 5508 : if (cur_offset >= block_end)
4518 : break;
4519 : }
4520 4954 : free_extent_map(em);
4521 4954 : unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
4522 : GFP_NOFS);
4523 4954 : return err;
4524 : }
4525 :
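      : /*
      : * handle the ATTR_SIZE part of a setattr: growing files get hole
      : * extents via btrfs_cont_expand(), shrinking files get an orphan
      : * item for safety and then go through btrfs_truncate()
      : */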
4526 7364 : static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4527 : {
4528 3682 : struct btrfs_root *root = BTRFS_I(inode)->root;
4529 : struct btrfs_trans_handle *trans;
4530 : loff_t oldsize = i_size_read(inode);
4531 3682 : loff_t newsize = attr->ia_size;
4532 3682 : int mask = attr->ia_valid;
4533 : int ret;
4534 :
4535 : /*
4536 : * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4537 : * special case where we need to update the times despite not having
4538 : * these flags set. For all other operations the VFS sets these flags
4539 : * explicitly if it wants a timestamp update.
4540 : */
4541 3682 : if (newsize != oldsize) {
4542 : inode_inc_iversion(inode);
4543 1773 : if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
4544 1304 : inode->i_ctime = inode->i_mtime =
4545 1304 : current_fs_time(inode->i_sb);
4546 : }
4547 :
4548 3682 : if (newsize > oldsize) {
4549 959 : truncate_pagecache(inode, newsize);
4550 959 : ret = btrfs_cont_expand(inode, oldsize, newsize);
4551 959 : if (ret)
4552 : return ret;
4553 :
4554 959 : trans = btrfs_start_transaction(root, 1);
4555 959 : if (IS_ERR(trans))
4556 0 : return PTR_ERR(trans);
4557 :
4558 : i_size_write(inode, newsize);
4559 959 : btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4560 959 : ret = btrfs_update_inode(trans, root, inode);
4561 959 : btrfs_end_transaction(trans, root);
4562 : } else {
4563 :
4564 : /*
4565 : * We're truncating a file that used to have good data down to
4566 : * zero. Make sure it gets into the ordered flush list so that
4567 : * any new writes get down to disk quickly.
4568 : */
4569 2723 : if (newsize == 0)
4570 : set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
4571 : &BTRFS_I(inode)->runtime_flags);
4572 :
4573 : /*
4574 : * 1 for the orphan item we're going to add
4575 : * 1 for the orphan item deletion.
4576 : */
4577 2723 : trans = btrfs_start_transaction(root, 2);
4578 2723 : if (IS_ERR(trans))
4579 0 : return PTR_ERR(trans);
4580 :
4581 : /*
4582 : * We need to do this in case we fail at _any_ point during the
4583 : * actual truncate. Once we do the truncate_setsize we could
4584 : * invalidate pages, which forces any outstanding ordered io to
4585 : * be instantly completed, which will give us extents that need
4586 : * to be truncated. If we fail to add the orphan item we
4587 : * could have left over extents that were never meant to live,
4588 : * so we need to guarantee from this point on that everything
4589 : * will be consistent.
4590 : */
4591 2723 : ret = btrfs_orphan_add(trans, inode);
4592 2723 : btrfs_end_transaction(trans, root);
4593 2723 : if (ret)
4594 : return ret;
4595 :
4596 : /* we don't support swapfiles, so vmtruncate shouldn't fail */
4597 2723 : truncate_setsize(inode, newsize);
4598 :
4599 : /* Disable nonlocked read DIO to avoid the end less truncate */
4600 : 	/* Disable nonlocked read DIO to avoid the endless truncate */
4601 2723 : inode_dio_wait(inode);
4602 : btrfs_inode_resume_unlocked_dio(inode);
4603 :
4604 2723 : ret = btrfs_truncate(inode);
4605 2723 : if (ret && inode->i_nlink) {
4606 : int err;
4607 :
4608 : /*
4609 : * failed to truncate, disk_i_size is only adjusted down
4610 : * as we remove extents, so it should represent the true
4611 : * size of the inode; reset the in-memory size and
4612 : * delete our orphan entry.
4613 : */
4614 0 : trans = btrfs_join_transaction(root);
4615 0 : if (IS_ERR(trans)) {
4616 0 : btrfs_orphan_del(NULL, inode);
4617 : return ret;
4618 : }
4619 0 : i_size_write(inode, BTRFS_I(inode)->disk_i_size);
4620 0 : err = btrfs_orphan_del(trans, inode);
4621 0 : if (err)
4622 0 : btrfs_abort_transaction(trans, root, err);
4623 0 : btrfs_end_transaction(trans, root);
4624 : }
4625 : }
4626 :
4627 : return ret;
4628 : }
4629 :
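      : /*
      : * the ->setattr callback: size changes go through btrfs_setsize(),
      : * everything else is copied into the inode and written back via
      : * btrfs_dirty_inode()
      : */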
4630 11752 : static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
4631 : {
4632 8070 : struct inode *inode = dentry->d_inode;
4633 8070 : struct btrfs_root *root = BTRFS_I(inode)->root;
4634 : int err;
4635 :
4636 8070 : if (btrfs_root_readonly(root))
4637 : return -EROFS;
4638 :
4639 8070 : err = inode_change_ok(inode, attr);
4640 8070 : if (err)
4641 : return err;
4642 :
4643 8070 : if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
4644 3682 : err = btrfs_setsize(inode, attr);
4645 3682 : if (err)
4646 : return err;
4647 : }
4648 :
4649 8070 : if (attr->ia_valid) {
4650 8070 : setattr_copy(inode, attr);
4651 : inode_inc_iversion(inode);
4652 8070 : err = btrfs_dirty_inode(inode);
4653 :
4654 8070 : if (!err && attr->ia_valid & ATTR_MODE)
4655 241 : err = posix_acl_chmod(inode, inode->i_mode);
4656 : }
4657 :
4658 8070 : return err;
4659 : }
4660 :
4661 : /*
4662 : * While truncating the inode pages during eviction, we get the VFS calling
4663 : * btrfs_invalidatepage() against each page of the inode. This is slow because
4664 : * the calls to btrfs_invalidatepage() result in a huge number of calls to
4665 : * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
4666 : * extent_state structures over and over, wasting lots of time.
4667 : *
4668 : * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
4669 : * those expensive operations on a per page basis and do only the ordered io
4670 : * finishing, while we release the extent_map and extent_state structures here,
4671 : * without the excessive merging and splitting.
4672 : */
4673 25704 : static void evict_inode_truncate_pages(struct inode *inode)
4674 : {
4675 25704 : struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4676 25704 : struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
4677 : struct rb_node *node;
4678 :
4679 : ASSERT(inode->i_state & I_FREEING);
4680 25704 : truncate_inode_pages_final(&inode->i_data);
4681 :
4682 25704 : write_lock(&map_tree->lock);
4683 126766 : while (!RB_EMPTY_ROOT(&map_tree->map)) {
4684 : struct extent_map *em;
4685 :
4686 75358 : node = rb_first(&map_tree->map);
4687 : em = rb_entry(node, struct extent_map, rb_node);
4688 : clear_bit(EXTENT_FLAG_PINNED, &em->flags);
4689 : clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
4690 75358 : remove_extent_mapping(map_tree, em);
4691 75358 : free_extent_map(em);
4692 75358 : if (need_resched()) {
4693 : write_unlock(&map_tree->lock);
4694 7 : cond_resched();
4695 7 : write_lock(&map_tree->lock);
4696 : }
4697 : }
4698 : write_unlock(&map_tree->lock);
4699 :
4700 : spin_lock(&io_tree->lock);
4701 42529 : while (!RB_EMPTY_ROOT(&io_tree->state)) {
4702 : struct extent_state *state;
4703 16825 : struct extent_state *cached_state = NULL;
4704 :
4705 16825 : node = rb_first(&io_tree->state);
4706 16825 : state = rb_entry(node, struct extent_state, rb_node);
4707 16825 : atomic_inc(&state->refs);
4708 : spin_unlock(&io_tree->lock);
4709 :
4710 16825 : lock_extent_bits(io_tree, state->start, state->end,
4711 : 0, &cached_state);
4712 16825 : clear_extent_bit(io_tree, state->start, state->end,
4713 : EXTENT_LOCKED | EXTENT_DIRTY |
4714 : EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
4715 : EXTENT_DEFRAG, 1, 1,
4716 : &cached_state, GFP_NOFS);
4717 16825 : free_extent_state(state);
4718 :
4719 16825 : cond_resched();
4720 : spin_lock(&io_tree->lock);
4721 : }
4722 : spin_unlock(&io_tree->lock);
4723 25704 : }
4724 :
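      : /*
      : * final iput path: drop the page cache and, if the inode is really
      : * being deleted, truncate away all of its items and remove the
      : * orphan item that protected it
      : */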
4725 25704 : void btrfs_evict_inode(struct inode *inode)
4726 : {
4727 : struct btrfs_trans_handle *trans;
4728 25704 : struct btrfs_root *root = BTRFS_I(inode)->root;
4729 : struct btrfs_block_rsv *rsv, *global_rsv;
4730 : u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
4731 : int ret;
4732 :
4733 25704 : trace_btrfs_inode_evict(inode);
4734 :
4735 25704 : evict_inode_truncate_pages(inode);
4736 :
4737 44501 : if (inode->i_nlink &&
4738 18742 : ((btrfs_root_refs(&root->root_item) != 0 &&
4739 19365 : root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
4740 623 : btrfs_is_free_space_inode(inode)))
4741 : goto no_delete;
4742 :
4743 7183 : if (is_bad_inode(inode)) {
4744 0 : btrfs_orphan_del(NULL, inode);
4745 0 : goto no_delete;
4746 : }
4747 : /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4748 7183 : btrfs_wait_ordered_range(inode, 0, (u64)-1);
4749 :
4750 7183 : if (root->fs_info->log_root_recovering) {
4751 0 : BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
4752 : &BTRFS_I(inode)->runtime_flags));
4753 : goto no_delete;
4754 : }
4755 :
4756 7183 : if (inode->i_nlink > 0) {
4757 276 : BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
4758 : root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
4759 : goto no_delete;
4760 : }
4761 :
4762 6907 : ret = btrfs_commit_inode_delayed_inode(inode);
4763 6907 : if (ret) {
4764 0 : btrfs_orphan_del(NULL, inode);
4765 0 : goto no_delete;
4766 : }
4767 :
4768 6907 : rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
4769 6907 : if (!rsv) {
4770 0 : btrfs_orphan_del(NULL, inode);
4771 0 : goto no_delete;
4772 : }
4773 6907 : rsv->size = min_size;
4774 6907 : rsv->failfast = 1;
4775 6907 : global_rsv = &root->fs_info->global_block_rsv;
4776 :
4777 : btrfs_i_size_write(inode, 0);
4778 :
4779 : /*
4780 : * This is a bit simpler than btrfs_truncate since we've already
4781 : * reserved our space for our orphan item in the unlink, so we just
4782 : * need to reserve some slack space in case we add bytes and update
4783 : * the inode item when doing the truncate.
4784 : */
4785 : while (1) {
4786 6907 : ret = btrfs_block_rsv_refill(root, rsv, min_size,
4787 : BTRFS_RESERVE_FLUSH_LIMIT);
4788 :
4789 : /*
4790 : * Try to steal from the global reserve since we will
4791 : * likely not use this space anyway; we want to try as
4792 : * hard as possible to get this to work.
4793 : */
4794 6907 : if (ret)
4795 0 : ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
4796 :
4797 6907 : if (ret) {
4798 0 : btrfs_warn(root->fs_info,
4799 : "Could not get space for a delete, will truncate on mount %d",
4800 : ret);
4801 0 : btrfs_orphan_del(NULL, inode);
4802 0 : btrfs_free_block_rsv(root, rsv);
4803 0 : goto no_delete;
4804 : }
4805 :
4806 6907 : trans = btrfs_join_transaction(root);
4807 6907 : if (IS_ERR(trans)) {
4808 0 : btrfs_orphan_del(NULL, inode);
4809 0 : btrfs_free_block_rsv(root, rsv);
4810 0 : goto no_delete;
4811 : }
4812 :
4813 6907 : trans->block_rsv = rsv;
4814 :
4815 6907 : ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
4816 6907 : if (ret != -ENOSPC)
4817 : break;
4818 :
4819 0 : trans->block_rsv = &root->fs_info->trans_block_rsv;
4820 0 : btrfs_end_transaction(trans, root);
4821 : trans = NULL;
4822 0 : btrfs_btree_balance_dirty(root);
4823 0 : }
4824 :
4825 6907 : btrfs_free_block_rsv(root, rsv);
4826 :
4827 : /*
4828 : * Errors here aren't a big deal; they just mean we leave orphan items
4829 : * in the tree. They will be cleaned up on the next mount.
4830 : */
4831 6907 : if (ret == 0) {
4832 6907 : trans->block_rsv = root->orphan_block_rsv;
4833 6907 : btrfs_orphan_del(trans, inode);
4834 : } else {
4835 0 : btrfs_orphan_del(NULL, inode);
4836 : }
4837 :
4838 6907 : trans->block_rsv = &root->fs_info->trans_block_rsv;
4839 13773 : if (!(root == root->fs_info->tree_root ||
4840 6866 : root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
4841 6866 : btrfs_return_ino(root, btrfs_ino(inode));
4842 :
4843 6907 : btrfs_end_transaction(trans, root);
4844 6907 : btrfs_btree_balance_dirty(root);
4845 : no_delete:
4846 25704 : btrfs_remove_delayed_node(inode);
4847 25704 : clear_inode(inode);
4848 25704 : return;
4849 : }
4850 :
4851 : /*
4852 : * this returns the key found in the dir entry in the location pointer.
4853 : * If no dir entries were found, location->objectid is 0.
4854 : */
4855 40768 : static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
4856 : struct btrfs_key *location)
4857 : {
4858 40768 : const char *name = dentry->d_name.name;
4859 40768 : int namelen = dentry->d_name.len;
4860 : struct btrfs_dir_item *di;
4861 : struct btrfs_path *path;
4862 40768 : struct btrfs_root *root = BTRFS_I(dir)->root;
4863 : int ret = 0;
4864 :
4865 40768 : path = btrfs_alloc_path();
4866 40769 : if (!path)
4867 : return -ENOMEM;
4868 :
4869 40769 : di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
4870 : namelen, 0);
4871 40768 : if (IS_ERR(di))
4872 0 : ret = PTR_ERR(di);
4873 :
4874 40768 : if (IS_ERR_OR_NULL(di))
4875 : goto out_err;
4876 :
4877 12304 : btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
4878 : out:
4879 40768 : btrfs_free_path(path);
4880 : return ret;
4881 : out_err:
4882 28464 : location->objectid = 0;
4883 : goto out;
4884 : }
4885 :
4886 : /*
4887 : * when we hit a tree root in a directory, the btrfs part of the inode
4888 : * needs to be changed to reflect the root directory of the tree root. This
4889 : * is kind of like crossing a mount point.
4890 : */
4891 205 : static int fixup_tree_root_location(struct btrfs_root *root,
4892 : struct inode *dir,
4893 : struct dentry *dentry,
4894 : struct btrfs_key *location,
4895 : struct btrfs_root **sub_root)
4896 : {
4897 : struct btrfs_path *path;
4898 : struct btrfs_root *new_root;
4899 : struct btrfs_root_ref *ref;
4900 : struct extent_buffer *leaf;
4901 : int ret;
4902 : int err = 0;
4903 :
4904 205 : path = btrfs_alloc_path();
4905 205 : if (!path) {
4906 : err = -ENOMEM;
4907 : goto out;
4908 : }
4909 :
4910 : err = -ENOENT;
4911 410 : ret = btrfs_find_item(root->fs_info->tree_root, path,
4912 205 : BTRFS_I(dir)->root->root_key.objectid,
4913 : location->objectid, BTRFS_ROOT_REF_KEY, NULL);
4914 205 : if (ret) {
4915 0 : if (ret < 0)
4916 : err = ret;
4917 : goto out;
4918 : }
4919 :
4920 205 : leaf = path->nodes[0];
4921 410 : ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
4922 410 : if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
4923 205 : btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
4924 : goto out;
4925 :
4926 410 : ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
4927 205 : (unsigned long)(ref + 1),
4928 : dentry->d_name.len);
4929 205 : if (ret)
4930 : goto out;
4931 :
4932 205 : btrfs_release_path(path);
4933 :
4934 205 : new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
4935 205 : if (IS_ERR(new_root)) {
4936 0 : err = PTR_ERR(new_root);
4937 : goto out;
4938 : }
4939 :
4940 205 : *sub_root = new_root;
4941 205 : location->objectid = btrfs_root_dirid(&new_root->root_item);
4942 205 : location->type = BTRFS_INODE_ITEM_KEY;
4943 205 : location->offset = 0;
4944 : err = 0;
4945 : out:
4946 205 : btrfs_free_path(path);
4947 205 : return err;
4948 : }
4949 :
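      : /*
      : * add the inode to the per-root rbtree of in-memory inodes, keyed
      : * by inode number
      : */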
4950 25418 : static void inode_tree_add(struct inode *inode)
4951 : {
4952 25418 : struct btrfs_root *root = BTRFS_I(inode)->root;
4953 : struct btrfs_inode *entry;
4954 : struct rb_node **p;
4955 : struct rb_node *parent;
4956 25418 : struct rb_node *new = &BTRFS_I(inode)->rb_node;
4957 : u64 ino = btrfs_ino(inode);
4958 :
4959 25418 : if (inode_unhashed(inode))
4960 : return;
4961 : parent = NULL;
4962 : spin_lock(&root->inode_lock);
4963 25418 : p = &root->inode_tree.rb_node;
4964 377160 : while (*p) {
4965 : parent = *p;
4966 : entry = rb_entry(parent, struct btrfs_inode, rb_node);
4967 :
4968 326324 : if (ino < btrfs_ino(&entry->vfs_inode))
4969 9541 : p = &parent->rb_left;
4970 316783 : else if (ino > btrfs_ino(&entry->vfs_inode))
4971 316783 : p = &parent->rb_right;
4972 : else {
4973 0 : WARN_ON(!(entry->vfs_inode.i_state &
4974 : (I_WILL_FREE | I_FREEING)));
4975 0 : rb_replace_node(parent, new, &root->inode_tree);
4976 0 : RB_CLEAR_NODE(parent);
4977 : spin_unlock(&root->inode_lock);
4978 : return;
4979 : }
4980 : }
4981 : rb_link_node(new, parent, p);
4982 25418 : rb_insert_color(new, &root->inode_tree);
4983 : spin_unlock(&root->inode_lock);
4984 : }
4985 :
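      : /*
      : * remove the inode from the per-root rbtree; if that leaves the
      : * tree empty and the root has no more refs, the root can be queued
      : * as a dead root
      : */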
4986 25704 : static void inode_tree_del(struct inode *inode)
4987 : {
4988 25704 : struct btrfs_root *root = BTRFS_I(inode)->root;
4989 : int empty = 0;
4990 :
4991 : spin_lock(&root->inode_lock);
4992 25704 : if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
4993 25411 : rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
4994 25411 : RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
4995 25411 : empty = RB_EMPTY_ROOT(&root->inode_tree);
4996 : }
4997 : spin_unlock(&root->inode_lock);
4998 :
4999 26392 : if (empty && btrfs_root_refs(&root->root_item) == 0) {
5000 33 : synchronize_srcu(&root->fs_info->subvol_srcu);
5001 : spin_lock(&root->inode_lock);
5002 33 : empty = RB_EMPTY_ROOT(&root->inode_tree);
5003 : spin_unlock(&root->inode_lock);
5004 33 : if (empty)
5005 33 : btrfs_add_dead_root(root);
5006 : }
5007 25704 : }
5008 :
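      : /*
      : * walk the per-root rbtree and drop the dcache aliases of every
      : * in-memory inode so that a deleted (or errored) root can be torn
      : * down
      : */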
5009 33 : void btrfs_invalidate_inodes(struct btrfs_root *root)
5010 : {
5011 : struct rb_node *node;
5012 : struct rb_node *prev;
5013 : struct btrfs_inode *entry;
5014 : struct inode *inode;
5015 : u64 objectid = 0;
5016 :
5017 66 : if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
5018 33 : WARN_ON(btrfs_root_refs(&root->root_item) != 0);
5019 :
5020 : spin_lock(&root->inode_lock);
5021 : again:
5022 88 : node = root->inode_tree.rb_node;
5023 : prev = NULL;
5024 258 : while (node) {
5025 : prev = node;
5026 : entry = rb_entry(node, struct btrfs_inode, rb_node);
5027 :
5028 100 : if (objectid < btrfs_ino(&entry->vfs_inode))
5029 42 : node = node->rb_left;
5030 58 : else if (objectid > btrfs_ino(&entry->vfs_inode))
5031 40 : node = node->rb_right;
5032 : else
5033 : break;
5034 : }
5035 88 : if (!node) {
5036 107 : while (prev) {
5037 : entry = rb_entry(prev, struct btrfs_inode, rb_node);
5038 74 : if (objectid <= btrfs_ino(&entry->vfs_inode)) {
5039 : node = prev;
5040 : break;
5041 : }
5042 37 : prev = rb_next(prev);
5043 : }
5044 : }
5045 88 : while (node) {
5046 : entry = rb_entry(node, struct btrfs_inode, rb_node);
5047 55 : objectid = btrfs_ino(&entry->vfs_inode) + 1;
5048 55 : inode = igrab(&entry->vfs_inode);
5049 55 : if (inode) {
5050 : spin_unlock(&root->inode_lock);
5051 55 : if (atomic_read(&inode->i_count) > 1)
5052 33 : d_prune_aliases(inode);
5053 : /*
5054 : * btrfs_drop_inode will have it removed from
5055 : * the inode cache when its usage count
5056 : * hits zero.
5057 : */
5058 55 : iput(inode);
5059 55 : cond_resched();
5060 : spin_lock(&root->inode_lock);
5061 : goto again;
5062 : }
5063 :
5064 0 : if (cond_resched_lock(&root->inode_lock))
5065 : goto again;
5066 :
5067 0 : node = rb_next(node);
5068 : }
5069 : spin_unlock(&root->inode_lock);
5070 33 : }
5071 :
5072 5018 : static int btrfs_init_locked_inode(struct inode *inode, void *p)
5073 : {
5074 : struct btrfs_iget_args *args = p;
5075 5018 : inode->i_ino = args->location->objectid;
5076 5018 : memcpy(&BTRFS_I(inode)->location, args->location,
5077 : sizeof(*args->location));
5078 5018 : BTRFS_I(inode)->root = args->root;
5079 5018 : return 0;
5080 : }
5081 :
5082 31256 : static int btrfs_find_actor(struct inode *inode, void *opaque)
5083 : {
5084 : struct btrfs_iget_args *args = opaque;
5085 62449 : return args->location->objectid == BTRFS_I(inode)->location.objectid &&
5086 31193 : args->root == BTRFS_I(inode)->root;
5087 : }
5088 :
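      : /*
      : * find or allocate the in-memory inode for (location, root) in the
      : * inode hash, using the two callbacks above to match and initialize
      : * entries
      : */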
5089 36211 : static struct inode *btrfs_iget_locked(struct super_block *s,
5090 : struct btrfs_key *location,
5091 36211 : struct btrfs_root *root)
5092 : {
5093 : struct inode *inode;
5094 : struct btrfs_iget_args args;
5095 36211 : unsigned long hashval = btrfs_inode_hash(location->objectid, root);
5096 :
5097 36211 : args.location = location;
5098 36211 : args.root = root;
5099 :
5100 36211 : inode = iget5_locked(s, hashval, btrfs_find_actor,
5101 : btrfs_init_locked_inode,
5102 : (void *)&args);
5103 36211 : return inode;
5104 : }
5105 :
5106 : /* Get an inode object given its location and corresponding root.
5107 : * Returns in *new whether the inode was read from disk.
5108 : */
5109 36211 : struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5110 : struct btrfs_root *root, int *new)
5111 : {
5112 : struct inode *inode;
5113 :
5114 36211 : inode = btrfs_iget_locked(s, location, root);
5115 36211 : if (!inode)
5116 : return ERR_PTR(-ENOMEM);
5117 :
5118 36211 : if (inode->i_state & I_NEW) {
5119 5018 : btrfs_read_locked_inode(inode);
5120 5018 : if (!is_bad_inode(inode)) {
5121 4946 : inode_tree_add(inode);
5122 4946 : unlock_new_inode(inode);
5123 4946 : if (new)
5124 3 : *new = 1;
5125 : } else {
5126 72 : unlock_new_inode(inode);
5127 72 : iput(inode);
5128 : inode = ERR_PTR(-ESTALE);
5129 : }
5130 : }
5131 :
5132 36211 : return inode;
5133 : }
5134 :
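 : /*
 :  * Caller-side sketch (assumed context, mirroring how btrfs_lookup_dentry()
 :  * below builds the key once btrfs_inode_by_name() resolves a name): an
 :  * inode number plus its root is enough to fetch the inode, with -ENOMEM
 :  * or -ESTALE coming back as an ERR_PTR on failure.
 :  *
 :  *	struct btrfs_key key;
 :  *	struct inode *inode;
 :  *
 :  *	key.objectid = ino;
 :  *	key.type = BTRFS_INODE_ITEM_KEY;
 :  *	key.offset = 0;
 :  *	inode = btrfs_iget(sb, &key, root, NULL);
 :  *	if (IS_ERR(inode))
 :  *		return PTR_ERR(inode);
 :  */
 :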
5135 0 : static struct inode *new_simple_dir(struct super_block *s,
5136 : struct btrfs_key *key,
5137 : struct btrfs_root *root)
5138 : {
5139 0 : struct inode *inode = new_inode(s);
5140 :
5141 0 : if (!inode)
5142 : return ERR_PTR(-ENOMEM);
5143 :
5144 0 : BTRFS_I(inode)->root = root;
5145 0 : memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5146 : set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5147 :
5148 0 : inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5149 0 : inode->i_op = &btrfs_dir_ro_inode_operations;
5150 0 : inode->i_fop = &simple_dir_operations;
5151 0 : inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5152 0 : inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
5153 :
5154 0 : return inode;
5155 : }
5156 :
5157 81537 : struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5158 : {
5159 : struct inode *inode;
5160 40768 : struct btrfs_root *root = BTRFS_I(dir)->root;
5161 40768 : struct btrfs_root *sub_root = root;
5162 : struct btrfs_key location;
5163 : int index;
5164 : int ret = 0;
5165 :
5166 40768 : if (dentry->d_name.len > BTRFS_NAME_LEN)
5167 : return ERR_PTR(-ENAMETOOLONG);
5168 :
5169 40769 : ret = btrfs_inode_by_name(dir, dentry, &location);
5170 40769 : if (ret < 0)
5171 0 : return ERR_PTR(ret);
5172 :
5173 40769 : if (location.objectid == 0)
5174 : return ERR_PTR(-ENOENT);
5175 :
5176 12304 : if (location.type == BTRFS_INODE_ITEM_KEY) {
5177 12099 : inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5178 12099 : return inode;
5179 : }
5180 :
5181 205 : BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
5182 :
5183 205 : index = srcu_read_lock(&root->fs_info->subvol_srcu);
5184 205 : ret = fixup_tree_root_location(root, dir, dentry,
5185 : &location, &sub_root);
5186 205 : if (ret < 0) {
5187 0 : if (ret != -ENOENT)
5188 0 : inode = ERR_PTR(ret);
5189 : else
5190 0 : inode = new_simple_dir(dir->i_sb, &location, sub_root);
5191 : } else {
5192 205 : inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5193 : }
5194 205 : srcu_read_unlock(&root->fs_info->subvol_srcu, index);
5195 :
5196 205 : if (!IS_ERR(inode) && root != sub_root) {
5197 205 : down_read(&root->fs_info->cleanup_work_sem);
5198 205 : if (!(inode->i_sb->s_flags & MS_RDONLY))
5199 205 : ret = btrfs_orphan_cleanup(sub_root);
5200 205 : up_read(&root->fs_info->cleanup_work_sem);
5201 205 : if (ret) {
5202 0 : iput(inode);
5203 0 : inode = ERR_PTR(ret);
5204 : }
5205 : /*
5206 : * If orphan cleanup did remove any orphans, it means the tree
5207 : * was modified and therefore the commit root is not the same as
5208 : * the current root anymore. This is a problem, because send
5209 : * uses the commit root and therefore can see inode items that
5210 : * don't exist in the current root anymore, and for example make
5211 : * calls to btrfs_iget, which will do tree lookups based on the
5212 : * current root and not on the commit root. Those lookups will
5213 : * fail, returning a -ESTALE error, and making send fail with
5214 : * that error. So make sure a send does not see any orphans we
5215 : * have just removed, and that it will see the same inodes
5216 : * regardless of whether a transaction commit happened before
5217 : * it started (meaning that the commit root will be the same as
5218 : * the current root) or not.
5219 : */
5220 205 : if (sub_root->node != sub_root->commit_root) {
5221 : u64 sub_flags = btrfs_root_flags(&sub_root->root_item);
5222 :
5223 6 : if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) {
5224 : struct extent_buffer *eb;
5225 :
5226 : /*
5227 : * Assert we can't have races between dentry
5228 : * lookup called through the snapshot creation
5229 : * ioctl and the VFS.
5230 : */
5231 : ASSERT(mutex_is_locked(&dir->i_mutex));
5232 :
5233 1 : down_write(&root->fs_info->commit_root_sem);
5234 1 : eb = sub_root->commit_root;
5235 1 : sub_root->commit_root =
5236 1 : btrfs_root_node(sub_root);
5237 1 : up_write(&root->fs_info->commit_root_sem);
5238 1 : free_extent_buffer(eb);
5239 : }
5240 : }
5241 : }
5242 :
5243 205 : return inode;
5244 : }
5245 :
5246 1454590 : static int btrfs_dentry_delete(const struct dentry *dentry)
5247 : {
5248 : struct btrfs_root *root;
5249 1454590 : struct inode *inode = dentry->d_inode;
5250 :
5251 1454590 : if (!inode && !IS_ROOT(dentry))
5252 24597 : inode = dentry->d_parent->d_inode;
5253 :
5254 1454590 : if (inode) {
5255 1454595 : root = BTRFS_I(inode)->root;
5256 1454595 : if (btrfs_root_refs(&root->root_item) == 0)
5257 : return 1;
5258 :
5259 1454600 : if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5260 : return 1;
5261 : }
5262 1454613 : return 0;
5263 : }
5264 :
5265 39072 : static void btrfs_dentry_release(struct dentry *dentry)
5266 : {
5267 39072 : kfree(dentry->d_fsdata);
5268 39072 : }
5269 :
5270 40574 : static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5271 : unsigned int flags)
5272 : {
5273 : struct inode *inode;
5274 :
5275 40574 : inode = btrfs_lookup_dentry(dir, dentry);
5276 40574 : if (IS_ERR(inode)) {
5277 28466 : if (PTR_ERR(inode) == -ENOENT)
5278 : inode = NULL;
5279 : else
5280 : return ERR_CAST(inode);
5281 : }
5282 :
5283 40574 : return d_materialise_unique(dentry, inode);
5284 : }
5285 :
5286 : unsigned char btrfs_filetype_table[] = {
5287 : DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
5288 : };
5289 :
5290 22061 : static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5291 : {
5292 : struct inode *inode = file_inode(file);
5293 22061 : struct btrfs_root *root = BTRFS_I(inode)->root;
5294 : struct btrfs_item *item;
5295 : struct btrfs_dir_item *di;
5296 : struct btrfs_key key;
5297 : struct btrfs_key found_key;
5298 : struct btrfs_path *path;
5299 : struct list_head ins_list;
5300 : struct list_head del_list;
5301 : int ret;
5302 169847 : struct extent_buffer *leaf;
5303 : int slot;
5304 : unsigned char d_type;
5305 : int over = 0;
5306 : u32 di_cur;
5307 : u32 di_total;
5308 : u32 di_len;
5309 : int key_type = BTRFS_DIR_INDEX_KEY;
5310 : char tmp_name[32];
5311 : char *name_ptr;
5312 : int name_len;
5313 : int is_curr = 0; /* ctx->pos points to the current index? */
5314 :
5315 : /* FIXME, use a real flag for deciding about the key type */
5316 22061 : if (root->fs_info->tree_root == root)
5317 : key_type = BTRFS_DIR_ITEM_KEY;
5318 :
5319 22061 : if (!dir_emit_dots(file, ctx))
5320 : return 0;
5321 :
5322 22061 : path = btrfs_alloc_path();
5323 22061 : if (!path)
5324 : return -ENOMEM;
5325 :
5326 22061 : path->reada = 1;
5327 :
5328 22061 : if (key_type == BTRFS_DIR_INDEX_KEY) {
5329 : INIT_LIST_HEAD(&ins_list);
5330 : INIT_LIST_HEAD(&del_list);
5331 22061 : btrfs_get_delayed_items(inode, &ins_list, &del_list);
5332 : }
5333 :
5334 22061 : btrfs_set_key_type(&key, key_type);
5335 22061 : key.offset = ctx->pos;
5336 22061 : key.objectid = btrfs_ino(inode);
5337 :
5338 22061 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5339 22061 : if (ret < 0)
5340 : goto err;
5341 :
5342 : while (1) {
5343 169847 : leaf = path->nodes[0];
5344 169847 : slot = path->slots[0];
5345 339694 : if (slot >= btrfs_header_nritems(leaf)) {
5346 1392 : ret = btrfs_next_leaf(root, path);
5347 1392 : if (ret < 0)
5348 : goto err;
5349 1392 : else if (ret > 0)
5350 : break;
5351 1313 : continue;
5352 : }
5353 :
5354 : item = btrfs_item_nr(slot);
5355 168455 : btrfs_item_key_to_cpu(leaf, &found_key, slot);
5356 :
5357 168455 : if (found_key.objectid != key.objectid)
5358 : break;
5359 146477 : if (btrfs_key_type(&found_key) != key_type)
5360 : break;
5361 146477 : if (found_key.offset < ctx->pos)
5362 : goto next;
5363 292954 : if (key_type == BTRFS_DIR_INDEX_KEY &&
5364 146477 : btrfs_should_delete_dir_index(&del_list,
5365 : found_key.offset))
5366 : goto next;
5367 :
5368 146360 : ctx->pos = found_key.offset;
5369 : is_curr = 1;
5370 :
5371 146360 : di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5372 : di_cur = 0;
5373 : di_total = btrfs_item_size(leaf, item);
5374 :
5375 146360 : while (di_cur < di_total) {
5376 : struct btrfs_key location;
5377 :
5378 146360 : if (verify_dir_item(root, leaf, di))
5379 : break;
5380 :
5381 146360 : name_len = btrfs_dir_name_len(leaf, di);
5382 146360 : if (name_len <= sizeof(tmp_name)) {
5383 : name_ptr = tmp_name;
5384 : } else {
5385 523 : name_ptr = kmalloc(name_len, GFP_NOFS);
5386 523 : if (!name_ptr) {
5387 : ret = -ENOMEM;
5388 0 : goto err;
5389 : }
5390 : }
5391 292720 : read_extent_buffer(leaf, name_ptr,
5392 146360 : (unsigned long)(di + 1), name_len);
5393 :
5394 146360 : d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5395 146360 : btrfs_dir_item_key_to_cpu(leaf, di, &location);
5396 :
5397 :
5398 : /* is this a reference to our own snapshot? If so
5399 : * skip it.
5400 : *
5401 : * In contrast to old kernels, we insert the snapshot's
5402 : * dir item and dir index after it has been created, so
5403 : * we won't find a reference to our own snapshot. We
5404 : * still keep the following code for backward
5405 : * compatibility.
5406 : */
5407 146382 : if (location.type == BTRFS_ROOT_ITEM_KEY &&
5408 22 : location.objectid == root->root_key.objectid) {
5409 : over = 0;
5410 : goto skip;
5411 : }
5412 292720 : over = !dir_emit(ctx, name_ptr, name_len,
5413 : location.objectid, d_type);
5414 :
5415 : skip:
5416 146360 : if (name_ptr != tmp_name)
5417 523 : kfree(name_ptr);
5418 :
5419 146360 : if (over)
5420 : goto nopos;
5421 292712 : di_len = btrfs_dir_name_len(leaf, di) +
5422 : btrfs_dir_data_len(leaf, di) + sizeof(*di);
5423 146356 : di_cur += di_len;
5424 146356 : di = (struct btrfs_dir_item *)((char *)di + di_len);
5425 : }
5426 : next:
5427 146473 : path->slots[0]++;
5428 : }
5429 :
5430 22057 : if (key_type == BTRFS_DIR_INDEX_KEY) {
5431 22057 : if (is_curr)
5432 11155 : ctx->pos++;
5433 22057 : ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5434 22057 : if (ret)
5435 : goto nopos;
5436 : }
5437 :
5438 : /* Reached end of directory/root. Bump pos past the last item. */
5439 22057 : ctx->pos++;
5440 :
5441 : /*
5442 : * Stop new entries from being returned after we return the last
5443 : * entry.
5444 : *
5445 : * New directory entries are assigned a strictly increasing
5446 : * offset. This means that new entries created during readdir
5447 : * are *guaranteed* to be seen in the future by that readdir.
5448 : * This has broken buggy programs which operate on names as
5449 : * they're returned by readdir. Until we re-use freed offsets
5450 : * we have this hack to stop new entries from being returned
5451 : * under the assumption that they'll never reach this huge
5452 : * offset.
5453 : *
5454 : * This is being careful not to overflow 32bit loff_t unless the
5455 : * last entry requires it because doing so has broken 32bit apps
5456 : * in the past.
5457 : */
5458 22057 : if (key_type == BTRFS_DIR_INDEX_KEY) {
5459 22057 : if (ctx->pos >= INT_MAX)
5460 10479 : ctx->pos = LLONG_MAX;
5461 : else
5462 11578 : ctx->pos = INT_MAX;
5463 : }
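 : /*
 :  * Worked example of the pinning above: a first readdir pass ends with
 :  * ctx->pos set to INT_MAX; if the same open directory is read again,
 :  * pos is already >= INT_MAX after the final increment, so it is pinned
 :  * at LLONG_MAX and entries created later (with ever-growing offsets)
 :  * are never returned to that reader.
 :  */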
5464 : nopos:
5465 : ret = 0;
5466 : err:
5467 22061 : if (key_type == BTRFS_DIR_INDEX_KEY)
5468 22061 : btrfs_put_delayed_items(&ins_list, &del_list);
5469 22061 : btrfs_free_path(path);
5470 22061 : return ret;
5471 : }
5472 :
5473 0 : int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
5474 : {
5475 0 : struct btrfs_root *root = BTRFS_I(inode)->root;
5476 : struct btrfs_trans_handle *trans;
5477 : int ret = 0;
5478 : bool nolock = false;
5479 :
5480 0 : if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5481 : return 0;
5482 :
5483 0 : if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
5484 : nolock = true;
5485 :
5486 0 : if (wbc->sync_mode == WB_SYNC_ALL) {
5487 0 : if (nolock)
5488 0 : trans = btrfs_join_transaction_nolock(root);
5489 : else
5490 0 : trans = btrfs_join_transaction(root);
5491 0 : if (IS_ERR(trans))
5492 0 : return PTR_ERR(trans);
5493 0 : ret = btrfs_commit_transaction(trans, root);
5494 : }
5495 0 : return ret;
5496 : }
5497 :
5498 : /*
5499 : * This is somewhat expensive, updating the tree every time the
5500 : * inode changes. But it is most likely to find the inode in cache.
5501 : * FIXME: needs more benchmarking... there are no reasons other than performance
5502 : * to keep or drop this code.
5503 : */
5504 13216 : static int btrfs_dirty_inode(struct inode *inode)
5505 : {
5506 13216 : struct btrfs_root *root = BTRFS_I(inode)->root;
5507 : struct btrfs_trans_handle *trans;
5508 : int ret;
5509 :
5510 13216 : if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5511 : return 0;
5512 :
5513 13216 : trans = btrfs_join_transaction(root);
5514 13216 : if (IS_ERR(trans))
5515 0 : return PTR_ERR(trans);
5516 :
5517 13216 : ret = btrfs_update_inode(trans, root, inode);
5518 13216 : if (ret && ret == -ENOSPC) {
5519 : /* whoops, let's try again with the full transaction */
5520 0 : btrfs_end_transaction(trans, root);
5521 0 : trans = btrfs_start_transaction(root, 1);
5522 0 : if (IS_ERR(trans))
5523 0 : return PTR_ERR(trans);
5524 :
5525 0 : ret = btrfs_update_inode(trans, root, inode);
5526 : }
5527 13216 : btrfs_end_transaction(trans, root);
5528 13216 : if (BTRFS_I(inode)->delayed_node)
5529 13216 : btrfs_balance_delayed_items(root);
5530 :
5531 13216 : return ret;
5532 : }
5533 :
5534 : /*
5535 : * This is a copy of file_update_time. We need this so we can return an error
5536 : * on ENOSPC when updating the inode for file writes and mmap writes.
5537 : */
5538 46157 : static int btrfs_update_time(struct inode *inode, struct timespec *now,
5539 : int flags)
5540 : {
5541 46157 : struct btrfs_root *root = BTRFS_I(inode)->root;
5542 :
5543 46157 : if (btrfs_root_readonly(root))
5544 : return -EROFS;
5545 :
5546 5146 : if (flags & S_VERSION)
5547 : inode_inc_iversion(inode);
5548 5146 : if (flags & S_CTIME)
5549 1544 : inode->i_ctime = *now;
5550 5146 : if (flags & S_MTIME)
5551 1544 : inode->i_mtime = *now;
5552 5146 : if (flags & S_ATIME)
5553 3602 : inode->i_atime = *now;
5554 5146 : return btrfs_dirty_inode(inode);
5555 : }
5556 :
5557 : /*
5558 : * find the highest existing sequence number in a directory
5559 : * and then set the in-memory index_cnt variable to reflect
5560 : * free sequence numbers
5561 : */
5562 185 : static int btrfs_set_inode_index_count(struct inode *inode)
5563 : {
5564 185 : struct btrfs_root *root = BTRFS_I(inode)->root;
5565 : struct btrfs_key key, found_key;
5566 : struct btrfs_path *path;
5567 : struct extent_buffer *leaf;
5568 : int ret;
5569 :
5570 185 : key.objectid = btrfs_ino(inode);
5571 : btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
5572 185 : key.offset = (u64)-1;
5573 :
5574 185 : path = btrfs_alloc_path();
5575 185 : if (!path)
5576 : return -ENOMEM;
5577 :
5578 185 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5579 185 : if (ret < 0)
5580 : goto out;
5581 : /* FIXME: we should be able to handle this */
5582 185 : if (ret == 0)
5583 : goto out;
5584 : ret = 0;
5585 :
5586 : /*
5587 : * MAGIC NUMBER EXPLANATION:
5588 : * we search a directory based on f_pos, and '.' and '..' have
5589 : * f_pos of 0 and 1 respectively, so every other entry has to start
5590 : * at 2
5591 : */
5592 185 : if (path->slots[0] == 0) {
5593 0 : BTRFS_I(inode)->index_cnt = 2;
5594 0 : goto out;
5595 : }
5596 :
5597 185 : path->slots[0]--;
5598 :
5599 185 : leaf = path->nodes[0];
5600 185 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5601 :
5602 555 : if (found_key.objectid != btrfs_ino(inode) ||
5603 185 : btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
5604 112 : BTRFS_I(inode)->index_cnt = 2;
5605 112 : goto out;
5606 : }
5607 :
5608 73 : BTRFS_I(inode)->index_cnt = found_key.offset + 1;
5609 : out:
5610 185 : btrfs_free_path(path);
5611 185 : return ret;
5612 : }
5613 :
5614 : /*
5615 : * helper to find a free sequence number in a given directory. The current
5616 : * code is very simple; later versions will do smarter things in the btree
5617 : */
5618 26501 : int btrfs_set_inode_index(struct inode *dir, u64 *index)
5619 : {
5620 : int ret = 0;
5621 :
5622 26501 : if (BTRFS_I(dir)->index_cnt == (u64)-1) {
5623 185 : ret = btrfs_inode_delayed_dir_index_count(dir);
5624 185 : if (ret) {
5625 185 : ret = btrfs_set_inode_index_count(dir);
5626 185 : if (ret)
5627 : return ret;
5628 : }
5629 : }
5630 :
5631 26501 : *index = BTRFS_I(dir)->index_cnt;
5632 26501 : BTRFS_I(dir)->index_cnt++;
5633 :
5634 26501 : return ret;
5635 : }
5636 :
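 : /*
 :  * Sketch of the usual create/link-path pairing (assumed context): the
 :  * caller takes the next free index for the parent directory and hands
 :  * it to btrfs_add_link(), exactly as btrfs_link() does further down.
 :  *
 :  *	u64 index;
 :  *	int err;
 :  *
 :  *	err = btrfs_set_inode_index(dir, &index);
 :  *	if (!err)
 :  *		err = btrfs_add_link(trans, dir, inode, name, name_len,
 :  *				     1, index);
 :  */
 :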
5637 20472 : static int btrfs_insert_inode_locked(struct inode *inode)
5638 : {
5639 : struct btrfs_iget_args args;
5640 20472 : args.location = &BTRFS_I(inode)->location;
5641 20472 : args.root = BTRFS_I(inode)->root;
5642 :
5643 40944 : return insert_inode_locked4(inode,
5644 20472 : btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
5645 : btrfs_find_actor, &args);
5646 : }
5647 :
5648 40944 : static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5649 : struct btrfs_root *root,
5650 : struct inode *dir,
5651 : const char *name, int name_len,
5652 : u64 ref_objectid, u64 objectid,
5653 : umode_t mode, u64 *index)
5654 : {
5655 : struct inode *inode;
5656 : struct btrfs_inode_item *inode_item;
5657 : struct btrfs_key *location;
5658 : struct btrfs_path *path;
5659 : struct btrfs_inode_ref *ref;
5660 : struct btrfs_key key[2];
5661 : u32 sizes[2];
5662 20472 : int nitems = name ? 2 : 1;
5663 : unsigned long ptr;
5664 : int ret;
5665 :
5666 20472 : path = btrfs_alloc_path();
5667 20472 : if (!path)
5668 : return ERR_PTR(-ENOMEM);
5669 :
5670 20472 : inode = new_inode(root->fs_info->sb);
5671 20472 : if (!inode) {
5672 0 : btrfs_free_path(path);
5673 0 : return ERR_PTR(-ENOMEM);
5674 : }
5675 :
5676 : /*
5677 : * For O_TMPFILE (no name), set the link count to 0, so that from this
5678 : * point on we fill in the inode item with the correct link count.
5679 : */
5680 20472 : if (!name)
5681 2 : set_nlink(inode, 0);
5682 :
5683 : /*
5684 : * we have to initialize this early, so we can reclaim the inode
5685 : * number if we fail afterwards in this function.
5686 : */
5687 20472 : inode->i_ino = objectid;
5688 :
5689 20472 : if (dir && name) {
5690 20421 : trace_btrfs_inode_request(dir);
5691 :
5692 20421 : ret = btrfs_set_inode_index(dir, index);
5693 20421 : if (ret) {
5694 0 : btrfs_free_path(path);
5695 0 : iput(inode);
5696 0 : return ERR_PTR(ret);
5697 : }
5698 51 : } else if (dir) {
5699 2 : *index = 0;
5700 : }
5701 : /*
5702 : * index_cnt is ignored for everything but a dir,
5703 : * btrfs_set_inode_index_count has an explanation for the magic
5704 : * number
5705 : */
5706 20472 : BTRFS_I(inode)->index_cnt = 2;
5707 20472 : BTRFS_I(inode)->dir_index = *index;
5708 20472 : BTRFS_I(inode)->root = root;
5709 20472 : BTRFS_I(inode)->generation = trans->transid;
5710 20472 : inode->i_generation = BTRFS_I(inode)->generation;
5711 :
5712 : /*
5713 : * We could have gotten an inode number from somebody who was fsynced
5714 : * and then removed in this same transaction, so let's just set full
5715 : * sync since it will be a full sync anyway and this will blow away the
5716 : * old info in the log.
5717 : */
5718 : set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
5719 :
5720 20472 : key[0].objectid = objectid;
5721 : btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
5722 20472 : key[0].offset = 0;
5723 :
5724 20472 : sizes[0] = sizeof(struct btrfs_inode_item);
5725 :
5726 20472 : if (name) {
5727 : /*
5728 : * Start new inodes with an inode_ref. This is slightly more
5729 : * efficient for small numbers of hard links since they will
5730 : * be packed into one item. Extended refs will kick in if we
5731 : * add more hard links than can fit in the ref item.
5732 : */
5733 20470 : key[1].objectid = objectid;
5734 : btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
5735 20470 : key[1].offset = ref_objectid;
5736 :
5737 20470 : sizes[1] = name_len + sizeof(*ref);
5738 : }
5739 :
5740 : location = &BTRFS_I(inode)->location;
5741 20472 : location->objectid = objectid;
5742 20472 : location->offset = 0;
5743 : btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
5744 :
5745 20472 : ret = btrfs_insert_inode_locked(inode);
5746 20472 : if (ret < 0)
5747 : goto fail;
5748 :
5749 20472 : path->leave_spinning = 1;
5750 20472 : ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
5751 20472 : if (ret != 0)
5752 : goto fail_unlock;
5753 :
5754 20472 : inode_init_owner(inode, dir, mode);
5755 20472 : inode_set_bytes(inode, 0);
5756 20472 : inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
5757 40944 : inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5758 : struct btrfs_inode_item);
5759 20472 : memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
5760 : sizeof(*inode_item));
5761 20472 : fill_inode_item(trans, path->nodes[0], inode_item, inode);
5762 :
5763 20472 : if (name) {
5764 40940 : ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
5765 : struct btrfs_inode_ref);
5766 20470 : btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
5767 20470 : btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
5768 20470 : ptr = (unsigned long)(ref + 1);
5769 20470 : write_extent_buffer(path->nodes[0], name, ptr, name_len);
5770 : }
5771 :
5772 20472 : btrfs_mark_buffer_dirty(path->nodes[0]);
5773 20472 : btrfs_free_path(path);
5774 :
5775 20472 : btrfs_inherit_iflags(inode, dir);
5776 :
5777 20472 : if (S_ISREG(mode)) {
5778 13866 : if (btrfs_test_opt(root, NODATASUM))
5779 1 : BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
5780 13866 : if (btrfs_test_opt(root, NODATACOW))
5781 1 : BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
5782 : BTRFS_INODE_NODATASUM;
5783 : }
5784 :
5785 20472 : inode_tree_add(inode);
5786 :
5787 20472 : trace_btrfs_inode_new(inode);
5788 : btrfs_set_inode_last_trans(trans, inode);
5789 :
5790 20472 : btrfs_update_root_times(trans, root);
5791 :
5792 20472 : ret = btrfs_inode_inherit_props(trans, inode, dir);
5793 20472 : if (ret)
5794 0 : btrfs_err(root->fs_info,
5795 : "error inheriting props for ino %llu (root %llu): %d",
5796 : btrfs_ino(inode), root->root_key.objectid, ret);
5797 :
5798 20472 : return inode;
5799 :
5800 : fail_unlock:
5801 0 : unlock_new_inode(inode);
5802 : fail:
5803 0 : if (dir && name)
5804 0 : BTRFS_I(dir)->index_cnt--;
5805 0 : btrfs_free_path(path);
5806 0 : iput(inode);
5807 0 : return ERR_PTR(ret);
5808 : }
5809 :
5810 : static inline u8 btrfs_inode_type(struct inode *inode)
5811 : {
5812 26306 : return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
5813 : }
5814 :
5815 : /*
5816 : * utility function to add 'inode' into 'parent_inode' with
5817 : * a given name and a given sequence number.
5818 : * if 'add_backref' is true, also insert a backref from the
5819 : * inode to the parent directory.
5820 : */
5821 26306 : int btrfs_add_link(struct btrfs_trans_handle *trans,
5822 26306 : struct inode *parent_inode, struct inode *inode,
5823 : const char *name, int name_len, int add_backref, u64 index)
5824 : {
5825 : int ret = 0;
5826 : struct btrfs_key key;
5827 26306 : struct btrfs_root *root = BTRFS_I(parent_inode)->root;
5828 : u64 ino = btrfs_ino(inode);
5829 : u64 parent_ino = btrfs_ino(parent_inode);
5830 :
5831 26306 : if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5832 1 : memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
5833 : } else {
5834 26305 : key.objectid = ino;
5835 : btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
5836 26305 : key.offset = 0;
5837 : }
5838 :
5839 26306 : if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5840 1 : ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
5841 : key.objectid, root->root_key.objectid,
5842 : parent_ino, index, name, name_len);
5843 26305 : } else if (add_backref) {
5844 3573 : ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
5845 : parent_ino, index);
5846 : }
5847 :
5848 : /* Nothing to clean up yet */
5849 26306 : if (ret)
5850 : return ret;
5851 :
5852 26306 : ret = btrfs_insert_dir_item(trans, root, name, name_len,
5853 : parent_inode, &key,
5854 : btrfs_inode_type(inode), index);
5855 26306 : if (ret == -EEXIST || ret == -EOVERFLOW)
5856 : goto fail_dir_item;
5857 26306 : else if (ret) {
5858 0 : btrfs_abort_transaction(trans, root, ret);
5859 0 : return ret;
5860 : }
5861 :
5862 52612 : btrfs_i_size_write(parent_inode, parent_inode->i_size +
5863 26306 : name_len * 2);
5864 : inode_inc_iversion(parent_inode);
5865 26306 : parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
5866 26306 : ret = btrfs_update_inode(trans, root, parent_inode);
5867 26306 : if (ret)
5868 0 : btrfs_abort_transaction(trans, root, ret);
5869 26306 : return ret;
5870 :
5871 : fail_dir_item:
5872 0 : if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5873 : u64 local_index;
5874 : int err;
5875 0 : err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
5876 : key.objectid, root->root_key.objectid,
5877 : parent_ino, &local_index, name, name_len);
5878 :
5879 0 : } else if (add_backref) {
5880 : u64 local_index;
5881 : int err;
5882 :
5883 0 : err = btrfs_del_inode_ref(trans, root, name, name_len,
5884 : ino, parent_ino, &local_index);
5885 : }
5886 0 : return ret;
5887 : }
5888 :
5889 : static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
5890 : struct inode *dir, struct dentry *dentry,
5891 : struct inode *inode, int backref, u64 index)
5892 : {
5893 21581 : int err = btrfs_add_link(trans, dir, inode,
5894 : dentry->d_name.name, dentry->d_name.len,
5895 : backref, index);
5896 21581 : if (err > 0)
5897 : err = -EEXIST;
5898 : return err;
5899 : }
5900 :
5901 4194 : static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5902 : umode_t mode, dev_t rdev)
5903 : {
5904 : struct btrfs_trans_handle *trans;
5905 2097 : struct btrfs_root *root = BTRFS_I(dir)->root;
5906 : struct inode *inode = NULL;
5907 : int err;
5908 : int drop_inode = 0;
5909 : u64 objectid;
5910 2097 : u64 index = 0;
5911 :
5912 : if (!new_valid_dev(rdev))
5913 : return -EINVAL;
5914 :
5915 : /*
5916 : * 2 for inode item and ref
5917 : * 2 for dir items
5918 : * 1 for xattr if selinux is on
5919 : */
5920 2097 : trans = btrfs_start_transaction(root, 5);
5921 2097 : if (IS_ERR(trans))
5922 0 : return PTR_ERR(trans);
5923 :
5924 2097 : err = btrfs_find_free_ino(root, &objectid);
5925 2097 : if (err)
5926 : goto out_unlock;
5927 :
5928 4194 : inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5929 2097 : dentry->d_name.len, btrfs_ino(dir), objectid,
5930 : mode, &index);
5931 2097 : if (IS_ERR(inode)) {
5932 0 : err = PTR_ERR(inode);
5933 0 : goto out_unlock;
5934 : }
5935 :
5936 : /*
5937 : * If the active LSM wants to access the inode during
5938 : * d_instantiate it needs these. Smack checks to see
5939 : * if the filesystem supports xattrs by looking at the
5940 : * ops vector.
5941 : */
5942 2097 : inode->i_op = &btrfs_special_inode_operations;
5943 2097 : init_special_inode(inode, inode->i_mode, rdev);
5944 :
5945 2097 : err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5946 2097 : if (err)
5947 : goto out_unlock_inode;
5948 :
5949 2097 : err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5950 2097 : if (err) {
5951 : goto out_unlock_inode;
5952 : } else {
5953 2097 : btrfs_update_inode(trans, root, inode);
5954 2097 : unlock_new_inode(inode);
5955 2097 : d_instantiate(dentry, inode);
5956 : }
5957 :
5958 : out_unlock:
5959 2097 : btrfs_end_transaction(trans, root);
5960 2097 : btrfs_balance_delayed_items(root);
5961 2097 : btrfs_btree_balance_dirty(root);
5962 2097 : if (drop_inode) {
5963 : inode_dec_link_count(inode);
5964 0 : iput(inode);
5965 : }
5966 2097 : return err;
5967 :
5968 : out_unlock_inode:
5969 : drop_inode = 1;
5970 0 : unlock_new_inode(inode);
5971 0 : goto out_unlock;
5972 :
5973 : }
5974 :
5975 27728 : static int btrfs_create(struct inode *dir, struct dentry *dentry,
5976 : umode_t mode, bool excl)
5977 : {
5978 : struct btrfs_trans_handle *trans;
5979 13864 : struct btrfs_root *root = BTRFS_I(dir)->root;
5980 : struct inode *inode = NULL;
5981 : int drop_inode_on_err = 0;
5982 : int err;
5983 : u64 objectid;
5984 13864 : u64 index = 0;
5985 :
5986 : /*
5987 : * 2 for inode item and ref
5988 : * 2 for dir items
5989 : * 1 for xattr if selinux is on
5990 : */
5991 13864 : trans = btrfs_start_transaction(root, 5);
5992 13864 : if (IS_ERR(trans))
5993 0 : return PTR_ERR(trans);
5994 :
5995 13864 : err = btrfs_find_free_ino(root, &objectid);
5996 13864 : if (err)
5997 : goto out_unlock;
5998 :
5999 27728 : inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6000 13864 : dentry->d_name.len, btrfs_ino(dir), objectid,
6001 : mode, &index);
6002 13864 : if (IS_ERR(inode)) {
6003 0 : err = PTR_ERR(inode);
6004 0 : goto out_unlock;
6005 : }
6006 : drop_inode_on_err = 1;
6007 : /*
6008 : * If the active LSM wants to access the inode during
6009 : * d_instantiate it needs these. Smack checks to see
6010 : * if the filesystem supports xattrs by looking at the
6011 : * ops vector.
6012 : */
6013 13864 : inode->i_fop = &btrfs_file_operations;
6014 13864 : inode->i_op = &btrfs_file_inode_operations;
6015 13864 : inode->i_mapping->a_ops = &btrfs_aops;
6016 13864 : inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
6017 :
6018 13864 : err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6019 13864 : if (err)
6020 : goto out_unlock_inode;
6021 :
6022 13864 : err = btrfs_update_inode(trans, root, inode);
6023 13864 : if (err)
6024 : goto out_unlock_inode;
6025 :
6026 13864 : err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6027 13864 : if (err)
6028 : goto out_unlock_inode;
6029 :
6030 13864 : BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
6031 13864 : unlock_new_inode(inode);
6032 13864 : d_instantiate(dentry, inode);
6033 :
6034 : out_unlock:
6035 13864 : btrfs_end_transaction(trans, root);
6036 13864 : if (err && drop_inode_on_err) {
6037 : inode_dec_link_count(inode);
6038 0 : iput(inode);
6039 : }
6040 13864 : btrfs_balance_delayed_items(root);
6041 13864 : btrfs_btree_balance_dirty(root);
6042 13864 : return err;
6043 :
6044 : out_unlock_inode:
6045 0 : unlock_new_inode(inode);
6046 0 : goto out_unlock;
6047 :
6048 : }
6049 :
6050 3573 : static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6051 3573 : struct dentry *dentry)
6052 : {
6053 : struct btrfs_trans_handle *trans;
6054 3573 : struct btrfs_root *root = BTRFS_I(dir)->root;
6055 3573 : struct inode *inode = old_dentry->d_inode;
6056 : u64 index;
6057 : int err;
6058 : int drop_inode = 0;
6059 :
6060 : /* do not allow sys_link's with other subvols of the same device */
6061 3573 : if (root->objectid != BTRFS_I(inode)->root->objectid)
6062 : return -EXDEV;
6063 :
6064 3573 : if (inode->i_nlink >= BTRFS_LINK_MAX)
6065 : return -EMLINK;
6066 :
6067 3573 : err = btrfs_set_inode_index(dir, &index);
6068 3573 : if (err)
6069 : goto fail;
6070 :
6071 : /*
6072 : * 2 items for inode and inode ref
6073 : * 2 items for dir items
6074 : * 1 item for parent inode
6075 : */
6076 3573 : trans = btrfs_start_transaction(root, 5);
6077 3573 : if (IS_ERR(trans)) {
6078 0 : err = PTR_ERR(trans);
6079 0 : goto fail;
6080 : }
6081 :
6082 : /* There are several dir indexes for this inode, clear the cache. */
6083 3573 : BTRFS_I(inode)->dir_index = 0ULL;
6084 3573 : inc_nlink(inode);
6085 : inode_inc_iversion(inode);
6086 3573 : inode->i_ctime = CURRENT_TIME;
6087 3573 : ihold(inode);
6088 : set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6089 :
6090 3573 : err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
6091 :
6092 3573 : if (err) {
6093 : drop_inode = 1;
6094 : } else {
6095 3573 : struct dentry *parent = dentry->d_parent;
6096 3573 : err = btrfs_update_inode(trans, root, inode);
6097 3573 : if (err)
6098 : goto fail;
6099 3573 : if (inode->i_nlink == 1) {
6100 : /*
6101 : * If new hard link count is 1, it's a file created
6102 : * with open(2) O_TMPFILE flag.
6103 : */
6104 1 : err = btrfs_orphan_del(trans, inode);
6105 1 : if (err)
6106 : goto fail;
6107 : }
6108 3573 : d_instantiate(dentry, inode);
6109 3573 : btrfs_log_new_name(trans, inode, NULL, parent);
6110 : }
6111 :
6112 3573 : btrfs_end_transaction(trans, root);
6113 3573 : btrfs_balance_delayed_items(root);
6114 : fail:
6115 3573 : if (drop_inode) {
6116 : inode_dec_link_count(inode);
6117 0 : iput(inode);
6118 : }
6119 3573 : btrfs_btree_balance_dirty(root);
6120 3573 : return err;
6121 : }
6122 :
6123 2413 : static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
6124 : {
6125 : struct inode *inode = NULL;
6126 : struct btrfs_trans_handle *trans;
6127 2413 : struct btrfs_root *root = BTRFS_I(dir)->root;
6128 : int err = 0;
6129 : int drop_on_err = 0;
6130 2413 : u64 objectid = 0;
6131 2413 : u64 index = 0;
6132 :
6133 : /*
6134 : * 2 items for inode and ref
6135 : * 2 items for dir items
6136 : * 1 for xattr if selinux is on
6137 : */
6138 2413 : trans = btrfs_start_transaction(root, 5);
6139 2413 : if (IS_ERR(trans))
6140 0 : return PTR_ERR(trans);
6141 :
6142 2413 : err = btrfs_find_free_ino(root, &objectid);
6143 2413 : if (err)
6144 : goto out_fail;
6145 :
6146 4826 : inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6147 2413 : dentry->d_name.len, btrfs_ino(dir), objectid,
6148 : S_IFDIR | mode, &index);
6149 2413 : if (IS_ERR(inode)) {
6150 0 : err = PTR_ERR(inode);
6151 0 : goto out_fail;
6152 : }
6153 :
6154 : drop_on_err = 1;
6155 : /* these must be set before we unlock the inode */
6156 2413 : inode->i_op = &btrfs_dir_inode_operations;
6157 2413 : inode->i_fop = &btrfs_dir_file_operations;
6158 :
6159 2413 : err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6160 2413 : if (err)
6161 : goto out_fail_inode;
6162 :
6163 : btrfs_i_size_write(inode, 0);
6164 2413 : err = btrfs_update_inode(trans, root, inode);
6165 2413 : if (err)
6166 : goto out_fail_inode;
6167 :
6168 4826 : err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
6169 2413 : dentry->d_name.len, 0, index);
6170 2413 : if (err)
6171 : goto out_fail_inode;
6172 :
6173 2413 : d_instantiate(dentry, inode);
6174 : /*
6175 : * mkdir is special. We're unlocking after we call d_instantiate
6176 : * to avoid a race with nfsd calling d_instantiate.
6177 : */
6178 2413 : unlock_new_inode(inode);
6179 : drop_on_err = 0;
6180 :
6181 : out_fail:
6182 2413 : btrfs_end_transaction(trans, root);
6183 2413 : if (drop_on_err)
6184 0 : iput(inode);
6185 2413 : btrfs_balance_delayed_items(root);
6186 2413 : btrfs_btree_balance_dirty(root);
6187 2413 : return err;
6188 :
6189 : out_fail_inode:
6190 0 : unlock_new_inode(inode);
6191 0 : goto out_fail;
6192 : }
6193 :
6194 : /* helper for btrfs_get_extent. Given an existing extent in the tree,
6195 : * and an extent that you want to insert, deal with overlap and insert
6196 : * the new extent into the tree.
6197 : */
6198 123 : static int merge_extent_mapping(struct extent_map_tree *em_tree,
6199 : struct extent_map *existing,
6200 123 : struct extent_map *em,
6201 : u64 map_start)
6202 : {
6203 : u64 start_diff;
6204 :
6205 246 : BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
6206 123 : start_diff = map_start - em->start;
6207 123 : em->start = map_start;
6208 123 : em->len = existing->start - em->start;
6209 123 : if (em->block_start < EXTENT_MAP_LAST_BYTE &&
6210 : !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
6211 0 : em->block_start += start_diff;
6212 0 : em->block_len -= start_diff;
6213 : }
6214 123 : return add_extent_mapping(em_tree, em, 0);
6215 : }
6216 :
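 : /*
 :  * Worked example with hypothetical numbers: if em spans [0, 16K), an
 :  * existing map starts at 8K, and map_start is 4K, then start_diff is
 :  * 4K, em->start becomes 4K and em->len becomes existing->start -
 :  * em->start = 4K, i.e. em is trimmed to [4K, 8K), with block_start
 :  * shifted by the same 4K for uncompressed, non-inline extents.
 :  */
 :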
6217 0 : static noinline int uncompress_inline(struct btrfs_path *path,
6218 : struct inode *inode, struct page *page,
6219 : size_t pg_offset, u64 extent_offset,
6220 : struct btrfs_file_extent_item *item)
6221 : {
6222 : int ret;
6223 0 : struct extent_buffer *leaf = path->nodes[0];
6224 : char *tmp;
6225 : size_t max_size;
6226 : unsigned long inline_size;
6227 : unsigned long ptr;
6228 : int compress_type;
6229 :
6230 0 : WARN_ON(pg_offset != 0);
6231 0 : compress_type = btrfs_file_extent_compression(leaf, item);
6232 : max_size = btrfs_file_extent_ram_bytes(leaf, item);
6233 0 : inline_size = btrfs_file_extent_inline_item_len(leaf,
6234 : btrfs_item_nr(path->slots[0]));
6235 : tmp = kmalloc(inline_size, GFP_NOFS);
6236 0 : if (!tmp)
6237 : return -ENOMEM;
6238 : ptr = btrfs_file_extent_inline_start(item);
6239 :
6240 0 : read_extent_buffer(leaf, tmp, ptr, inline_size);
6241 :
6242 0 : max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
6243 0 : ret = btrfs_decompress(compress_type, tmp, page,
6244 : extent_offset, inline_size, max_size);
6245 0 : kfree(tmp);
6246 : return ret;
6247 : }
6248 :
6249 : /*
6250 : * a bit scary, this does extent mapping from logical file offset to the disk.
6251 : * the ugly parts come from merging extents from the disk with the in-ram
6252 : * representation. This gets more complex because of the data=ordered code,
6253 : * where the in-ram extents might be locked pending data=ordered completion.
6254 : *
6255 : * This also copies inline extents directly into the page.
6256 : */
6257 :
6258 1605869 : struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
6259 : size_t pg_offset, u64 start, u64 len,
6260 : int create)
6261 : {
6262 : int ret;
6263 : int err = 0;
6264 : u64 extent_start = 0;
6265 : u64 extent_end = 0;
6266 : u64 objectid = btrfs_ino(inode);
6267 : u32 found_type;
6268 0 : struct btrfs_path *path = NULL;
6269 1605417 : struct btrfs_root *root = BTRFS_I(inode)->root;
6270 : struct btrfs_file_extent_item *item;
6271 9778 : struct extent_buffer *leaf;
6272 : struct btrfs_key found_key;
6273 40793 : struct extent_map *em = NULL;
6274 1605417 : struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
6275 1605417 : struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6276 : struct btrfs_trans_handle *trans = NULL;
6277 1605417 : const bool new_inline = !page || create;
6278 :
6279 : again:
6280 1605417 : read_lock(&em_tree->lock);
6281 1605436 : em = lookup_extent_mapping(em_tree, start, len);
6282 1605428 : if (em)
6283 1565195 : em->bdev = root->fs_info->fs_devices->latest_bdev;
6284 : read_unlock(&em_tree->lock);
6285 :
6286 1605435 : if (em) {
6287 1565203 : if (em->start > start || em->start + em->len <= start)
6288 153 : free_extent_map(em);
6289 1565050 : else if (em->block_start == EXTENT_MAP_INLINE && page)
6290 2 : free_extent_map(em);
6291 : else
6292 : goto out;
6293 : }
6294 40388 : em = alloc_extent_map();
6295 40385 : if (!em) {
6296 : err = -ENOMEM;
6297 : goto out;
6298 : }
6299 40385 : em->bdev = root->fs_info->fs_devices->latest_bdev;
6300 40385 : em->start = EXTENT_MAP_HOLE;
6301 40385 : em->orig_start = EXTENT_MAP_HOLE;
6302 40385 : em->len = (u64)-1;
6303 40385 : em->block_len = (u64)-1;
6304 :
6305 : if (!path) {
6306 40385 : path = btrfs_alloc_path();
6307 40385 : if (!path) {
6308 : err = -ENOMEM;
6309 : goto out;
6310 : }
6311 : /*
6312 : * Chances are we'll be called again, so go ahead and do
6313 : * readahead
6314 : */
6315 40386 : path->reada = 1;
6316 : }
6317 :
6318 40386 : ret = btrfs_lookup_file_extent(trans, root, path,
6319 : objectid, start, trans != NULL);
6320 40386 : if (ret < 0) {
6321 : err = ret;
6322 : goto out;
6323 : }
6324 :
6325 40387 : if (ret != 0) {
6326 9638 : if (path->slots[0] == 0)
6327 : goto not_found;
6328 9644 : path->slots[0]--;
6329 : }
6330 :
6331 40393 : leaf = path->nodes[0];
6332 80778 : item = btrfs_item_ptr(leaf, path->slots[0],
6333 : struct btrfs_file_extent_item);
6334 : /* are we inside the extent that was found? */
6335 40385 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6336 40388 : found_type = btrfs_key_type(&found_key);
6337 40388 : if (found_key.objectid != objectid ||
6338 : found_type != BTRFS_EXTENT_DATA_KEY) {
6339 : /*
6340 : * If we back up past the first extent we want to move forward
6341 : * and see if there is an extent in front of us, otherwise we'll
6342 : * say there is a hole for our whole search range which can
6343 : * cause problems.
6344 : */
6345 : extent_end = start;
6346 : goto next;
6347 : }
6348 :
6349 38441 : found_type = btrfs_file_extent_type(leaf, item);
6350 38441 : extent_start = found_key.offset;
6351 38441 : if (found_type == BTRFS_FILE_EXTENT_REG ||
6352 : found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6353 37945 : extent_end = extent_start +
6354 : btrfs_file_extent_num_bytes(leaf, item);
6355 491 : } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6356 : size_t size;
6357 491 : size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6358 491 : extent_end = ALIGN(extent_start + size, root->sectorsize);
6359 : }
6360 : next:
6361 40527 : if (start >= extent_end) {
6362 9778 : path->slots[0]++;
6363 19556 : if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6364 5247 : ret = btrfs_next_leaf(root, path);
6365 5248 : if (ret < 0) {
6366 : err = ret;
6367 : goto out;
6368 : }
6369 5247 : if (ret > 0)
6370 : goto not_found;
6371 244 : leaf = path->nodes[0];
6372 : }
6373 4775 : btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6374 4932 : if (found_key.objectid != objectid ||
6375 156 : found_key.type != BTRFS_EXTENT_DATA_KEY)
6376 : goto not_found;
6377 156 : if (start + len <= found_key.offset)
6378 : goto not_found;
6379 155 : if (start > found_key.offset)
6380 : goto next;
6381 12 : em->start = start;
6382 12 : em->orig_start = start;
6383 12 : em->len = found_key.offset - start;
6384 12 : goto not_found_em;
6385 : }
6386 :
6387 30749 : btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
6388 :
6389 30749 : if (found_type == BTRFS_FILE_EXTENT_REG ||
6390 : found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6391 : goto insert;
6392 491 : } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6393 : unsigned long ptr;
6394 : char *map;
6395 : size_t size;
6396 : size_t extent_offset;
6397 : size_t copy_size;
6398 :
6399 491 : if (new_inline)
6400 : goto out;
6401 :
6402 452 : size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6403 452 : extent_offset = page_offset(page) + pg_offset - extent_start;
6404 452 : copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
6405 : size - extent_offset);
6406 452 : em->start = extent_start + extent_offset;
6407 452 : em->len = ALIGN(copy_size, root->sectorsize);
6408 452 : em->orig_block_len = em->len;
6409 452 : em->orig_start = em->start;
6410 452 : ptr = btrfs_file_extent_inline_start(item) + extent_offset;
6411 904 : if (create == 0 && !PageUptodate(page)) {
6412 452 : if (btrfs_file_extent_compression(leaf, item) !=
6413 : BTRFS_COMPRESS_NONE) {
6414 0 : ret = uncompress_inline(path, inode, page,
6415 : pg_offset,
6416 : extent_offset, item);
6417 0 : if (ret) {
6418 : err = ret;
6419 : goto out;
6420 : }
6421 : } else {
6422 : map = kmap(page);
6423 452 : read_extent_buffer(leaf, map + pg_offset, ptr,
6424 : copy_size);
6425 452 : if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
6426 452 : memset(map + pg_offset + copy_size, 0,
6427 452 : PAGE_CACHE_SIZE - pg_offset -
6428 : copy_size);
6429 : }
6430 : kunmap(page);
6431 : }
6432 : flush_dcache_page(page);
6433 0 : } else if (create && PageUptodate(page)) {
6434 0 : BUG();
6435 : if (!trans) {
6436 : kunmap(page);
6437 : free_extent_map(em);
6438 : em = NULL;
6439 :
6440 : btrfs_release_path(path);
6441 : trans = btrfs_join_transaction(root);
6442 :
6443 : if (IS_ERR(trans))
6444 : return ERR_CAST(trans);
6445 : goto again;
6446 : }
6447 : map = kmap(page);
6448 : write_extent_buffer(leaf, map + pg_offset, ptr,
6449 : copy_size);
6450 : kunmap(page);
6451 : btrfs_mark_buffer_dirty(leaf);
6452 : }
6453 452 : set_extent_uptodate(io_tree, em->start,
6454 : extent_map_end(em) - 1, NULL, GFP_NOFS);
6455 452 : goto insert;
6456 : }
6457 : not_found:
6458 9618 : em->start = start;
6459 9618 : em->orig_start = start;
6460 9618 : em->len = len;
6461 : not_found_em:
6462 9630 : em->block_start = EXTENT_MAP_HOLE;
6463 : set_bit(EXTENT_FLAG_VACANCY, &em->flags);
6464 : insert:
6465 40350 : btrfs_release_path(path);
6466 80685 : if (em->start > start || extent_map_end(em) <= start) {
6467 3 : btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
6468 : em->start, em->len, start, len);
6469 : err = -EIO;
6470 0 : goto out;
6471 : }
6472 :
6473 : err = 0;
6474 40341 : write_lock(&em_tree->lock);
6475 40350 : ret = add_extent_mapping(em_tree, em, 0);
6476 : /* it is possible that someone inserted the extent into the tree
6477 : * while we had the lock dropped. It is also possible that
6478 : * an overlapping map exists in the tree
6479 : */
6480 40347 : if (ret == -EEXIST) {
6481 : struct extent_map *existing;
6482 :
6483 : ret = 0;
6484 :
6485 125 : existing = lookup_extent_mapping(em_tree, start, len);
6486 127 : if (existing && (existing->start > start ||
6487 2 : existing->start + existing->len <= start)) {
6488 123 : free_extent_map(existing);
6489 : existing = NULL;
6490 : }
6491 125 : if (!existing) {
6492 123 : existing = lookup_extent_mapping(em_tree, em->start,
6493 : em->len);
6494 123 : if (existing) {
6495 123 : err = merge_extent_mapping(em_tree, existing,
6496 : em, start);
6497 123 : free_extent_map(existing);
6498 123 : if (err) {
6499 0 : free_extent_map(em);
6500 : em = NULL;
6501 : }
6502 : } else {
6503 : err = -EIO;
6504 0 : free_extent_map(em);
6505 : em = NULL;
6506 : }
6507 : } else {
6508 2 : free_extent_map(em);
6509 : em = existing;
6510 : err = 0;
6511 : }
6512 : }
6513 : write_unlock(&em_tree->lock);
6514 : out:
6515 :
6516 1605432 : trace_btrfs_get_extent(root, em);
6517 :
6518 1605429 : if (path)
6519 40384 : btrfs_free_path(path);
6520 : if (trans) {
6521 : ret = btrfs_end_transaction(trans, root);
6522 : if (!err)
6523 : err = ret;
6524 : }
6525 1605428 : if (err) {
6526 0 : free_extent_map(em);
6527 0 : return ERR_PTR(err);
6528 : }
6529 1605428 : BUG_ON(!em); /* Error is always set */
6530 : return em;
6531 : }
6532 :
6533 1891 : struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
6534 : size_t pg_offset, u64 start, u64 len,
6535 : int create)
6536 : {
6537 : struct extent_map *em;
6538 73 : struct extent_map *hole_em = NULL;
6539 1891 : u64 range_start = start;
6540 : u64 end;
6541 : u64 found;
6542 : u64 found_end;
6543 : int err = 0;
6544 :
6545 1891 : em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
6546 1891 : if (IS_ERR(em))
6547 : return em;
6548 1891 : if (em) {
6549 : /*
6550 : * if our em maps to
6551 : * - a hole or
6552 : * - a pre-alloc extent,
6553 : * there might actually be delalloc bytes behind it.
6554 : */
6555 3279 : if (em->block_start != EXTENT_MAP_HOLE &&
6556 : !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6557 : return em;
6558 : else
6559 : hole_em = em;
6560 : }
6561 :
6562 : /* check to see if we've wrapped (len == -1 or similar) */
6563 688 : end = start + len;
6564 688 : if (end < start)
6565 : end = (u64)-1;
6566 : else
6567 688 : end -= 1;
6568 :
6569 : em = NULL;
6570 :
6571 : /* ok, we didn't find anything, lets look for delalloc */
6572 688 : found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
6573 : end, len, EXTENT_DELALLOC, 1);
6574 688 : found_end = range_start + found;
6575 688 : if (found_end < range_start)
6576 : found_end = (u64)-1;
6577 :
6578 : /*
6579 : * we didn't find anything useful, return
6580 : * the original results from get_extent()
6581 : */
6582 688 : if (range_start > end || found_end <= start) {
6583 : em = hole_em;
6584 : hole_em = NULL;
6585 : goto out;
6586 : }
6587 :
6588 : /* adjust the range_start to make sure it doesn't
6589 : * go backwards from the start they passed in
6590 : */
6591 73 : range_start = max(start, range_start);
6592 73 : found = found_end - range_start;
6593 :
6594 73 : if (found > 0) {
6595 : u64 hole_start = start;
6596 : u64 hole_len = len;
6597 :
6598 73 : em = alloc_extent_map();
6599 73 : if (!em) {
6600 : err = -ENOMEM;
6601 : goto out;
6602 : }
6603 : /*
6604 : * when btrfs_get_extent can't find anything it
6605 : * returns one huge hole
6606 : *
6607 : * make sure what it found really fits our range, and
6608 : * adjust to make sure it is based on the start from
6609 : * the caller
6610 : */
6611 73 : if (hole_em) {
6612 : u64 calc_end = extent_map_end(hole_em);
6613 :
6614 73 : if (calc_end <= start || (hole_em->start > end)) {
6615 0 : free_extent_map(hole_em);
6616 0 : hole_em = NULL;
6617 : } else {
6618 73 : hole_start = max(hole_em->start, start);
6619 73 : hole_len = calc_end - hole_start;
6620 : }
6621 : }
6622 73 : em->bdev = NULL;
6623 73 : if (hole_em && range_start > hole_start) {
6624 : /* our hole starts before our delalloc, so we
6625 : * have to return just the parts of the hole
6626 : * that go until the delalloc starts
6627 : */
6628 26 : em->len = min(hole_len,
6629 : range_start - hole_start);
6630 26 : em->start = hole_start;
6631 26 : em->orig_start = hole_start;
6632 : /*
6633 : * don't adjust block start at all,
6634 : * it is fixed at EXTENT_MAP_HOLE
6635 : */
6636 26 : em->block_start = hole_em->block_start;
6637 26 : em->block_len = hole_len;
6638 26 : if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
6639 : set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
6640 : } else {
6641 47 : em->start = range_start;
6642 47 : em->len = found;
6643 47 : em->orig_start = range_start;
6644 47 : em->block_start = EXTENT_MAP_DELALLOC;
6645 47 : em->block_len = found;
6646 : }
6647 0 : } else if (hole_em) {
6648 : return hole_em;
6649 : }
6650 : out:
6651 :
6652 688 : free_extent_map(hole_em);
6653 688 : if (err) {
6654 0 : free_extent_map(em);
6655 0 : return ERR_PTR(err);
6656 : }
6657 : return em;
6658 : }
6659 :
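 : /*
 :  * Worked example with a hypothetical layout: given a hole at [0, 16K)
 :  * and delalloc bytes pending at [4K, 8K), a lookup over [0, 16K) first
 :  * returns a 4K hole map at [0, 4K) (the hole-before-delalloc branch),
 :  * and a follow-up lookup at 4K returns an EXTENT_MAP_DELALLOC map of
 :  * length 4K covering [4K, 8K).
 :  */
 :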
6660 25259 : static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
6661 : u64 start, u64 len)
6662 : {
6663 25259 : struct btrfs_root *root = BTRFS_I(inode)->root;
6664 : struct extent_map *em;
6665 : struct btrfs_key ins;
6666 : u64 alloc_hint;
6667 : int ret;
6668 :
6669 25259 : alloc_hint = get_extent_allocation_hint(inode, start, len);
6670 25260 : ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
6671 : alloc_hint, &ins, 1, 1);
6672 25259 : if (ret)
6673 0 : return ERR_PTR(ret);
6674 :
6675 25259 : em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
6676 : ins.offset, ins.offset, ins.offset, 0);
6677 25254 : if (IS_ERR(em)) {
6678 0 : btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
6679 0 : return em;
6680 : }
6681 :
6682 25254 : ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
6683 : ins.offset, ins.offset, 0);
6684 25259 : if (ret) {
6685 0 : btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
6686 0 : free_extent_map(em);
6687 0 : return ERR_PTR(ret);
6688 : }
6689 :
6690 : return em;
6691 : }
6692 :
6693 : /*
6694 : * returns 1 when the nocow is safe, < 0 on error, 0 if the
6695 : * block must be cow'd
6696 : */
6697 0 : noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6698 : u64 *orig_start, u64 *orig_block_len,
6699 : u64 *ram_bytes)
6700 : {
6701 : struct btrfs_trans_handle *trans;
6702 : struct btrfs_path *path;
6703 : int ret;
6704 : struct extent_buffer *leaf;
6705 0 : struct btrfs_root *root = BTRFS_I(inode)->root;
6706 0 : struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6707 : struct btrfs_file_extent_item *fi;
6708 : struct btrfs_key key;
6709 : u64 disk_bytenr;
6710 : u64 backref_offset;
6711 : u64 extent_end;
6712 : u64 num_bytes;
6713 : int slot;
6714 : int found_type;
6715 0 : bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
6716 :
6717 0 : path = btrfs_alloc_path();
6718 0 : if (!path)
6719 : return -ENOMEM;
6720 :
6721 0 : ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
6722 : offset, 0);
6723 0 : if (ret < 0)
6724 : goto out;
6725 :
6726 0 : slot = path->slots[0];
6727 0 : if (ret == 1) {
6728 0 : if (slot == 0) {
6729 : /* can't find the item, must cow */
6730 : ret = 0;
6731 : goto out;
6732 : }
6733 0 : slot--;
6734 : }
6735 : ret = 0;
6736 0 : leaf = path->nodes[0];
6737 0 : btrfs_item_key_to_cpu(leaf, &key, slot);
6738 0 : if (key.objectid != btrfs_ino(inode) ||
6739 0 : key.type != BTRFS_EXTENT_DATA_KEY) {
6740 : /* not our file or wrong item type, must cow */
6741 : goto out;
6742 : }
6743 :
6744 0 : if (key.offset > offset) {
6745 : /* Wrong offset, must cow */
6746 : goto out;
6747 : }
6748 :
6749 0 : fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
6750 0 : found_type = btrfs_file_extent_type(leaf, fi);
6751 0 : if (found_type != BTRFS_FILE_EXTENT_REG &&
6752 : found_type != BTRFS_FILE_EXTENT_PREALLOC) {
6753 : /* not a regular extent, must cow */
6754 : goto out;
6755 : }
6756 :
6757 0 : if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6758 : goto out;
6759 :
6760 0 : extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6761 0 : if (extent_end <= offset)
6762 : goto out;
6763 :
6764 : disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6765 0 : if (disk_bytenr == 0)
6766 : goto out;
6767 :
6768 0 : if (btrfs_file_extent_compression(leaf, fi) ||
6769 0 : btrfs_file_extent_encryption(leaf, fi) ||
6770 : btrfs_file_extent_other_encoding(leaf, fi))
6771 : goto out;
6772 :
6773 : backref_offset = btrfs_file_extent_offset(leaf, fi);
6774 :
6775 0 : if (orig_start) {
6776 0 : *orig_start = key.offset - backref_offset;
6777 0 : *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
6778 0 : *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6779 : }
6780 :
6781 0 : if (btrfs_extent_readonly(root, disk_bytenr))
6782 : goto out;
6783 :
6784 0 : num_bytes = min(offset + *len, extent_end) - offset;
6785 0 : if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6786 : u64 range_end;
6787 :
6788 0 : range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
6789 0 : ret = test_range_bit(io_tree, offset, range_end,
6790 : EXTENT_DELALLOC, 0, NULL);
6791 0 : if (ret) {
6792 : ret = -EAGAIN;
6793 : goto out;
6794 : }
6795 : }
6796 :
6797 0 : btrfs_release_path(path);
6798 :
6799 : /*
6800 : * look for other files referencing this extent, if we
6801 : * find any we must cow
6802 : */
6803 0 : trans = btrfs_join_transaction(root);
6804 0 : if (IS_ERR(trans)) {
6805 : ret = 0;
6806 : goto out;
6807 : }
6808 :
6809 0 : ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
6810 0 : key.offset - backref_offset, disk_bytenr);
6811 0 : btrfs_end_transaction(trans, root);
6812 0 : if (ret) {
6813 : ret = 0;
6814 : goto out;
6815 : }
6816 :
6817 : /*
6818 : * adjust disk_bytenr and num_bytes to cover just the bytes
6819 : * in this extent we are about to write. If there
6820 : * are any csums in that range we have to cow in order
6821 : * to keep the csums correct
6822 : */
6823 0 : disk_bytenr += backref_offset;
6824 0 : disk_bytenr += offset - key.offset;
6825 0 : if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6826 : goto out;
6827 : /*
6828 : * all of the above have passed, it is safe to overwrite this extent
6829 : * without cow
6830 : */
6831 0 : *len = num_bytes;
6832 : ret = 1;
6833 : out:
6834 0 : btrfs_free_path(path);
6835 0 : return ret;
6836 : }
6837 :
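 : /*
 :  * Caller-side sketch (assumed names): the direct-IO write path probes
 :  * whether it may overwrite in place; 1 means nocow is safe for the
 :  * (possibly shrunken) *len bytes, anything else means allocate and cow:
 :  *
 :  *	u64 len = max_write, orig_start, orig_block_len, ram_bytes;
 :  *	int ret;
 :  *
 :  *	ret = can_nocow_extent(inode, start, &len, &orig_start,
 :  *			       &orig_block_len, &ram_bytes);
 :  *	if (ret != 1)
 :  *		em = btrfs_new_extent_direct(inode, start, len);
 :  */
 :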
6838 25365 : bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
6839 : {
6840 25365 : struct radix_tree_root *root = &inode->i_mapping->page_tree;
6841 : int found = false;
6842 25365 : void **pagep = NULL;
6843 : struct page *page = NULL;
6844 : int start_idx;
6845 : int end_idx;
6846 :
6847 25365 : start_idx = start >> PAGE_CACHE_SHIFT;
6848 :
6849 : /*
6850 : * end is the last byte in the last page. end == start is legal
6851 : */
6852 25365 : end_idx = end >> PAGE_CACHE_SHIFT;
6853 :
6854 : rcu_read_lock();
6855 :
6856 : /* Most of the code in this while loop is lifted from
6857 : * find_get_page. It's been modified to begin searching from a
6858 : * page and return just the first page found in that range. If the
6859 : * found idx is less than or equal to the end idx then we know that
6860 : * a page exists. If no pages are found or if those pages are
6861 : * outside of the range then we're fine (yay!) */
6862 50764 : while (page == NULL &&
6863 25364 : radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
6864 34 : page = radix_tree_deref_slot(pagep);
6865 34 : if (unlikely(!page))
6866 : break;
6867 :
6868 34 : if (radix_tree_exception(page)) {
6869 0 : if (radix_tree_deref_retry(page)) {
6870 : page = NULL;
6871 0 : continue;
6872 : }
6873 : /*
6874 : * Otherwise, shmem/tmpfs must be storing a swap entry
6875 : * here as an exceptional entry: so return it without
6876 : * attempting to raise page count.
6877 : */
6878 : page = NULL;
6879 : break; /* TODO: Is this relevant for this use case? */
6880 : }
6881 :
6882 34 : if (!page_cache_get_speculative(page)) {
6883 : page = NULL;
6884 0 : continue;
6885 : }
6886 :
6887 : /*
6888 : * Has the page moved?
6889 : * This is part of the lockless pagecache protocol. See
6890 : * include/linux/pagemap.h for details.
6891 : */
6892 34 : if (unlikely(page != *pagep)) {
6893 0 : page_cache_release(page);
6894 : page = NULL;
6895 : }
6896 : }
6897 :
6898 25366 : if (page) {
6899 34 : if (page->index <= end_idx)
6900 : found = true;
6901 34 : page_cache_release(page);
6902 : }
6903 :
6904 : rcu_read_unlock();
6905 25365 : return found;
6906 : }
6907 :
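     :	/*
     :	 * Lock [lockstart, lockend] in the io tree for DIO, looping until the
     :	 * range has neither ordered extents nor cached pages: wait out any
     :	 * ordered extent, or flush and invalidate the page cache, then retry.
     :	 * A nonzero return means invalidation failed and the caller should
     :	 * fall back to buffered IO.
     :	 */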
6908 25259 : static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
6909 : struct extent_state **cached_state, int writing)
6910 : {
6911 : struct btrfs_ordered_extent *ordered;
6912 : int ret = 0;
6913 :
6914 : while (1) {
6915 25259 : lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6916 : 0, cached_state);
6917 : /*
6918 : * We're concerned with the entire range that we're going to be
6919 :		 * doing DIO to, so we need to make sure there are no ordered
6920 :		 * extents in this range.
6921 : */
6922 25257 : ordered = btrfs_lookup_ordered_range(inode, lockstart,
6923 25257 : lockend - lockstart + 1);
6924 :
6925 : /*
6926 : * We need to make sure there are no buffered pages in this
6927 :		 * range either; we could have raced between the invalidate in
6928 : * generic_file_direct_write and locking the extent. The
6929 : * invalidate needs to happen so that reads after a write do not
6930 : * get stale data.
6931 : */
6932 25257 : if (!ordered &&
6933 25252 : (!writing ||
6934 25256 : !btrfs_page_exists_in_range(inode, lockstart, lockend)))
6935 : break;
6936 :
6937 0 : unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6938 : cached_state, GFP_NOFS);
6939 :
6940 0 : if (ordered) {
6941 0 : btrfs_start_ordered_extent(inode, ordered, 1);
6942 0 : btrfs_put_ordered_extent(ordered);
6943 : } else {
6944 : /* Screw you mmap */
6945 0 : ret = filemap_write_and_wait_range(inode->i_mapping,
6946 : lockstart,
6947 : lockend);
6948 0 : if (ret)
6949 : break;
6950 :
6951 : /*
6952 : * If we found a page that couldn't be invalidated just
6953 : * fall back to buffered.
6954 : */
6955 0 : ret = invalidate_inode_pages2_range(inode->i_mapping,
6956 0 : lockstart >> PAGE_CACHE_SHIFT,
6957 0 : lockend >> PAGE_CACHE_SHIFT);
6958 0 : if (ret)
6959 : break;
6960 : }
6961 :
6962 0 : cond_resched();
6963 0 : }
6964 :
6965 25254 : return ret;
6966 : }
6967 :
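     :	/*
     :	 * Insert an extent_map for a DIO write. EXTENT_FLAG_PINNED keeps the
     :	 * mapping from being dropped until the ordered extent completes, and
     :	 * EXTENT_FLAG_FILLING marks a write into preallocated space.
     :	 */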
6968 25256 : static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
6969 : u64 len, u64 orig_start,
6970 : u64 block_start, u64 block_len,
6971 : u64 orig_block_len, u64 ram_bytes,
6972 : int type)
6973 : {
6974 : struct extent_map_tree *em_tree;
6975 : struct extent_map *em;
6976 25256 : struct btrfs_root *root = BTRFS_I(inode)->root;
6977 : int ret;
6978 :
6979 25256 : em_tree = &BTRFS_I(inode)->extent_tree;
6980 25256 : em = alloc_extent_map();
6981 25257 : if (!em)
6982 : return ERR_PTR(-ENOMEM);
6983 :
6984 25257 : em->start = start;
6985 25257 : em->orig_start = orig_start;
6986 25257 : em->mod_start = start;
6987 25257 : em->mod_len = len;
6988 25257 : em->len = len;
6989 25257 : em->block_len = block_len;
6990 25257 : em->block_start = block_start;
6991 25257 : em->bdev = root->fs_info->fs_devices->latest_bdev;
6992 25257 : em->orig_block_len = orig_block_len;
6993 25257 : em->ram_bytes = ram_bytes;
6994 25257 : em->generation = -1;
6995 : set_bit(EXTENT_FLAG_PINNED, &em->flags);
6996 25259 : if (type == BTRFS_ORDERED_PREALLOC)
6997 : set_bit(EXTENT_FLAG_FILLING, &em->flags);
6998 :
6999 : do {
7000 25254 : btrfs_drop_extent_cache(inode, em->start,
7001 25254 : em->start + em->len - 1, 0);
7002 25256 : write_lock(&em_tree->lock);
7003 25259 : ret = add_extent_mapping(em_tree, em, 1);
7004 : write_unlock(&em_tree->lock);
7005 25255 : } while (ret == -EEXIST);
7006 :
7007 25255 : if (ret) {
7008 0 : free_extent_map(em);
7009 0 : return ERR_PTR(ret);
7010 : }
7011 :
7012 : return em;
7013 : }
7014 :
7015 :
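     :	/*
     :	 * get_block_t callback for __blockdev_direct_IO: lock the range, map
     :	 * [start, start + len) to an extent (reusing a NOCOW/PREALLOC extent
     :	 * where possible, otherwise allocating a new one) and fill bh_result
     :	 * for the generic DIO code.
     :	 */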
7016 50518 : static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7017 : struct buffer_head *bh_result, int create)
7018 : {
7019 : struct extent_map *em;
7020 25258 : struct btrfs_root *root = BTRFS_I(inode)->root;
7021 25258 : struct extent_state *cached_state = NULL;
7022 25258 : u64 start = iblock << inode->i_blkbits;
7023 : u64 lockstart, lockend;
7024 25258 : u64 len = bh_result->b_size;
7025 : int unlock_bits = EXTENT_LOCKED;
7026 : int ret = 0;
7027 :
7028 25258 : if (create)
7029 : unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
7030 : else
7031 0 : len = min_t(u64, len, root->sectorsize);
7032 :
7033 : lockstart = start;
7034 25258 : lockend = start + len - 1;
7035 :
7036 : /*
7037 : * If this errors out it's because we couldn't invalidate pagecache for
7038 :	 * this range and we need to fall back to buffered IO.
7039 : */
7040 25258 : if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
7041 : return -ENOTBLK;
7042 :
7043 25256 : em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
7044 25253 : if (IS_ERR(em)) {
7045 0 : ret = PTR_ERR(em);
7046 0 : goto unlock_err;
7047 : }
7048 :
7049 : /*
7050 :	 * OK, for INLINE and COMPRESSED extents we need to fall back on buffered
7051 :	 * io. INLINE is special, and we could probably kludge it in here, but
7052 :	 * it's still buffered, so for safety let's just fall back to the generic
7053 :	 * buffered path.
7054 : *
7055 : * For COMPRESSED we _have_ to read the entire extent in so we can
7056 : * decompress it, so there will be buffering required no matter what we
7057 : * do, so go ahead and fallback to buffered.
7058 : *
7059 :	 * We return -ENOTBLK because that's what makes DIO fall back
7060 :	 * to buffered IO. Don't blame me, this is the price we pay for using
7061 : * the generic code.
7062 : */
7063 50507 : if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7064 25254 : em->block_start == EXTENT_MAP_INLINE) {
7065 0 : free_extent_map(em);
7066 : ret = -ENOTBLK;
7067 0 : goto unlock_err;
7068 : }
7069 :
7070 : /* Just a good old fashioned hole, return */
7071 25254 : if (!create && (em->block_start == EXTENT_MAP_HOLE ||
7072 : test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7073 0 : free_extent_map(em);
7074 0 : goto unlock_err;
7075 : }
7076 :
7077 : /*
7078 : * We don't allocate a new extent in the following cases
7079 : *
7080 : * 1) The inode is marked as NODATACOW. In this case we'll just use the
7081 : * existing extent.
7082 : * 2) The extent is marked as PREALLOC. We're good to go here and can
7083 : * just use the extent.
7084 : *
7085 : */
7086 25254 : if (!create) {
7087 0 : len = min(len, em->len - (start - em->start));
7088 0 : lockstart = start + len;
7089 0 : goto unlock;
7090 : }
7091 :
7092 50507 : if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7093 25253 : ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7094 : em->block_start != EXTENT_MAP_HOLE)) {
7095 : int type;
7096 : int ret;
7097 : u64 block_start, orig_start, orig_block_len, ram_bytes;
7098 :
7099 0 : if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7100 : type = BTRFS_ORDERED_PREALLOC;
7101 : else
7102 : type = BTRFS_ORDERED_NOCOW;
7103 0 : len = min(len, em->len - (start - em->start));
7104 0 : block_start = em->block_start + (start - em->start);
7105 :
7106 0 : if (can_nocow_extent(inode, start, &len, &orig_start,
7107 : &orig_block_len, &ram_bytes) == 1) {
7108 0 : if (type == BTRFS_ORDERED_PREALLOC) {
7109 0 : free_extent_map(em);
7110 0 : em = create_pinned_em(inode, start, len,
7111 : orig_start,
7112 : block_start, len,
7113 : orig_block_len,
7114 : ram_bytes, type);
7115 0 : if (IS_ERR(em))
7116 : goto unlock_err;
7117 : }
7118 :
7119 0 : ret = btrfs_add_ordered_extent_dio(inode, start,
7120 : block_start, len, len, type);
7121 0 : if (ret) {
7122 0 : free_extent_map(em);
7123 0 : goto unlock_err;
7124 : }
7125 0 : goto unlock;
7126 : }
7127 : }
7128 :
7129 : /*
7130 :	 * this will cow the extent; reset the len in case we changed
7131 :	 * it above
7132 : */
7133 25254 : len = bh_result->b_size;
7134 25254 : free_extent_map(em);
7135 25260 : em = btrfs_new_extent_direct(inode, start, len);
7136 25259 : if (IS_ERR(em)) {
7137 0 : ret = PTR_ERR(em);
7138 0 : goto unlock_err;
7139 : }
7140 25259 : len = min(len, em->len - (start - em->start));
7141 : unlock:
7142 50518 : bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7143 25259 : inode->i_blkbits;
7144 25259 : bh_result->b_size = len;
7145 25259 : bh_result->b_bdev = em->bdev;
7146 : set_buffer_mapped(bh_result);
7147 25259 : if (create) {
7148 25260 : if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7149 : set_buffer_new(bh_result);
7150 :
7151 : /*
7152 : * Need to update the i_size under the extent lock so buffered
7153 : * readers will get the updated i_size when we unlock.
7154 : */
7155 50520 : if (start + len > i_size_read(inode))
7156 3813 : i_size_write(inode, start + len);
7157 :
7158 : spin_lock(&BTRFS_I(inode)->lock);
7159 25260 : BTRFS_I(inode)->outstanding_extents++;
7160 : spin_unlock(&BTRFS_I(inode)->lock);
7161 :
7162 25259 : ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7163 25259 : lockstart + len - 1, EXTENT_DELALLOC, NULL,
7164 : &cached_state, GFP_NOFS);
7165 25259 : BUG_ON(ret);
7166 : }
7167 :
7168 : /*
7169 :	 * In the case of a write we need to clear and unlock the entire range;
7170 :	 * in the case of a read we need to unlock only the end area that we
7171 :	 * aren't using, if there is any leftover space.
7172 : */
7173 25258 : if (lockstart < lockend) {
7174 25258 : clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7175 : lockend, unlock_bits, 1, 0,
7176 : &cached_state, GFP_NOFS);
7177 : } else {
7178 0 : free_extent_state(cached_state);
7179 : }
7180 :
7181 25257 : free_extent_map(em);
7182 :
7183 25260 : return 0;
7184 :
7185 : unlock_err:
7186 0 : clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7187 : unlock_bits, 1, 0, &cached_state, GFP_NOFS);
7188 0 : return ret;
7189 : }
7190 :
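     :	/*
     :	 * end_io for DIO reads: re-checksum each segment and compare against
     :	 * the csums that were looked up at submit time (dip->csum), then
     :	 * unlock the extent range and complete the original dio_bio.
     :	 */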
7191 0 : static void btrfs_endio_direct_read(struct bio *bio, int err)
7192 : {
7193 0 : struct btrfs_dio_private *dip = bio->bi_private;
7194 : struct bio_vec *bvec;
7195 0 : struct inode *inode = dip->inode;
7196 0 : struct btrfs_root *root = BTRFS_I(inode)->root;
7197 : struct bio *dio_bio;
7198 0 : u32 *csums = (u32 *)dip->csum;
7199 : u64 start;
7200 : int i;
7201 :
7202 0 : start = dip->logical_offset;
7203 0 : bio_for_each_segment_all(bvec, bio, i) {
7204 0 : if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
7205 0 : struct page *page = bvec->bv_page;
7206 : char *kaddr;
7207 0 : u32 csum = ~(u32)0;
7208 : unsigned long flags;
7209 :
7210 0 : local_irq_save(flags);
7211 : kaddr = kmap_atomic(page);
7212 0 : csum = btrfs_csum_data(kaddr + bvec->bv_offset,
7213 0 : csum, bvec->bv_len);
7214 0 : btrfs_csum_final(csum, (char *)&csum);
7215 : kunmap_atomic(kaddr);
7216 0 : local_irq_restore(flags);
7217 :
7218 : flush_dcache_page(bvec->bv_page);
7219 0 : if (csum != csums[i]) {
7220 0 : btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
7221 : btrfs_ino(inode), start, csum,
7222 : csums[i]);
7223 : err = -EIO;
7224 : }
7225 : }
7226 :
7227 0 : start += bvec->bv_len;
7228 : }
7229 :
7230 0 : unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
7231 0 : dip->logical_offset + dip->bytes - 1);
7232 0 : dio_bio = dip->dio_bio;
7233 :
7234 0 : kfree(dip);
7235 :
7236 : /* If we had a csum failure make sure to clear the uptodate flag */
7237 0 : if (err)
7238 : clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7239 0 : dio_end_io(dio_bio, err);
7240 0 : bio_put(bio);
7241 0 : }
7242 :
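     :	/*
     :	 * end_io for DIO writes: mark the ordered extent(s) covered by this
     :	 * bio complete and queue finish_ordered_fn; a single bio may span
     :	 * several ordered extents, hence the again/out_test loop below.
     :	 */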
7243 25258 : static void btrfs_endio_direct_write(struct bio *bio, int err)
7244 : {
7245 25258 : struct btrfs_dio_private *dip = bio->bi_private;
7246 25258 : struct inode *inode = dip->inode;
7247 25258 : struct btrfs_root *root = BTRFS_I(inode)->root;
7248 25258 : struct btrfs_ordered_extent *ordered = NULL;
7249 25258 : u64 ordered_offset = dip->logical_offset;
7250 25258 : u64 ordered_bytes = dip->bytes;
7251 : struct bio *dio_bio;
7252 : int ret;
7253 :
7254 25258 : if (err)
7255 : goto out_done;
7256 : again:
7257 25260 : ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
7258 : &ordered_offset,
7259 : ordered_bytes, !err);
7260 25260 : if (!ret)
7261 : goto out_test;
7262 :
7263 25260 : btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
7264 : finish_ordered_fn, NULL, NULL);
7265 25256 : btrfs_queue_work(root->fs_info->endio_write_workers,
7266 25256 : &ordered->work);
7267 : out_test:
7268 : /*
7269 : * our bio might span multiple ordered extents. If we haven't
7270 : * completed the accounting for the whole dio, go back and try again
7271 : */
7272 25260 : if (ordered_offset < dip->logical_offset + dip->bytes) {
7273 0 : ordered_bytes = dip->logical_offset + dip->bytes -
7274 : ordered_offset;
7275 0 : ordered = NULL;
7276 0 : goto again;
7277 : }
7278 : out_done:
7279 25258 : dio_bio = dip->dio_bio;
7280 :
7281 25258 : kfree(dip);
7282 :
7283 : /* If we had an error make sure to clear the uptodate flag */
7284 25260 : if (err)
7285 : clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7286 25260 : dio_end_io(dio_bio, err);
7287 25260 : bio_put(bio);
7288 25255 : }
7289 :
7290 0 : static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
7291 : struct bio *bio, int mirror_num,
7292 : unsigned long bio_flags, u64 offset)
7293 : {
7294 : int ret;
7295 0 : struct btrfs_root *root = BTRFS_I(inode)->root;
7296 0 : ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
7297 0 : BUG_ON(ret); /* -ENOMEM */
7298 0 : return 0;
7299 : }
7300 :
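     :	/*
     :	 * Completion for one split bio of a DIO: record any error in the dip,
     :	 * and when the last pending bio finishes, fail or complete the
     :	 * original bio.
     :	 */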
7301 0 : static void btrfs_end_dio_bio(struct bio *bio, int err)
7302 : {
7303 0 : struct btrfs_dio_private *dip = bio->bi_private;
7304 :
7305 0 : if (err) {
7306 0 : btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
7307 : "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
7308 : btrfs_ino(dip->inode), bio->bi_rw,
7309 : (unsigned long long)bio->bi_iter.bi_sector,
7310 : bio->bi_iter.bi_size, err);
7311 0 : dip->errors = 1;
7312 :
7313 : /*
7314 :		 * before the atomic variable goes to zero, we must make sure
7315 :		 * dip->errors is perceived to be set.
7316 : */
7317 0 : smp_mb__before_atomic();
7318 : }
7319 :
7320 : /* if there are more bios still pending for this dio, just exit */
7321 0 : if (!atomic_dec_and_test(&dip->pending_bios))
7322 : goto out;
7323 :
7324 0 : if (dip->errors) {
7325 0 : bio_io_error(dip->orig_bio);
7326 : } else {
7327 0 : set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
7328 0 : bio_endio(dip->orig_bio, 0);
7329 : }
7330 : out:
7331 0 : bio_put(bio);
7332 0 : }
7333 :
7334 0 : static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
7335 : u64 first_sector, gfp_t gfp_flags)
7336 : {
7337 0 : int nr_vecs = bio_get_nr_vecs(bdev);
7338 0 : return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
7339 : }
7340 :
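     :	/*
     :	 * Submit one piece of a DIO. Writes are csummed either inline or via
     :	 * the worker threads (async_submit); reads are hooked up to the endio
     :	 * workqueue and have their expected csums looked up first. Then the
     :	 * bio is mapped and submitted.
     :	 */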
7341 25255 : static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7342 : int rw, u64 file_offset, int skip_sum,
7343 : int async_submit)
7344 : {
7345 25255 : struct btrfs_dio_private *dip = bio->bi_private;
7346 25255 : int write = rw & REQ_WRITE;
7347 25255 : struct btrfs_root *root = BTRFS_I(inode)->root;
7348 : int ret;
7349 :
7350 25255 : if (async_submit)
7351 0 : async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
7352 :
7353 25255 : bio_get(bio);
7354 :
7355 25260 : if (!write) {
7356 0 : ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
7357 0 : if (ret)
7358 : goto err;
7359 : }
7360 :
7361 25255 : if (skip_sum)
7362 : goto map;
7363 :
7364 25255 : if (write && async_submit) {
7365 0 : ret = btrfs_wq_submit_bio(root->fs_info,
7366 : inode, rw, bio, 0, 0,
7367 : file_offset,
7368 : __btrfs_submit_bio_start_direct_io,
7369 : __btrfs_submit_bio_done);
7370 0 : goto err;
7371 25255 : } else if (write) {
7372 : /*
7373 : * If we aren't doing async submit, calculate the csum of the
7374 : * bio now.
7375 : */
7376 25255 : ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
7377 25258 : if (ret)
7378 : goto err;
7379 0 : } else if (!skip_sum) {
7380 0 : ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
7381 : file_offset);
7382 0 : if (ret)
7383 : goto err;
7384 : }
7385 :
7386 : map:
7387 25258 : ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
7388 : err:
7389 25264 : bio_put(bio);
7390 25260 : return ret;
7391 : }
7392 :
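     :	/*
     :	 * Split the original bio where btrfs_map_block says a single bio
     :	 * can't reach any further (e.g. a stripe boundary), giving each piece
     :	 * its own bio counted in dip->pending_bios. In the common case the
     :	 * whole bio fits and orig_bio is submitted directly.
     :	 */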
7393 25258 : static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7394 : int skip_sum)
7395 : {
7396 25258 : struct inode *inode = dip->inode;
7397 25258 : struct btrfs_root *root = BTRFS_I(inode)->root;
7398 : struct bio *bio;
7399 25258 : struct bio *orig_bio = dip->orig_bio;
7400 25258 : struct bio_vec *bvec = orig_bio->bi_io_vec;
7401 25258 : u64 start_sector = orig_bio->bi_iter.bi_sector;
7402 25258 : u64 file_offset = dip->logical_offset;
7403 : u64 submit_len = 0;
7404 : u64 map_length;
7405 : int nr_pages = 0;
7406 : int ret = 0;
7407 : int async_submit = 0;
7408 :
7409 25258 : map_length = orig_bio->bi_iter.bi_size;
7410 25258 : ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
7411 : &map_length, NULL, 0);
7412 25259 : if (ret)
7413 : return -EIO;
7414 :
7415 25259 : if (map_length >= orig_bio->bi_iter.bi_size) {
7416 : bio = orig_bio;
7417 : goto submit;
7418 : }
7419 :
7420 : /* async crcs make it difficult to collect full stripe writes. */
7421 0 : if (btrfs_get_alloc_profile(root, 1) &
7422 : (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
7423 : async_submit = 0;
7424 : else
7425 : async_submit = 1;
7426 :
7427 0 : bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
7428 0 : if (!bio)
7429 : return -ENOMEM;
7430 :
7431 0 : bio->bi_private = dip;
7432 0 : bio->bi_end_io = btrfs_end_dio_bio;
7433 0 : atomic_inc(&dip->pending_bios);
7434 :
7435 0 : while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
7436 0 : if (unlikely(map_length < submit_len + bvec->bv_len ||
7437 : bio_add_page(bio, bvec->bv_page, bvec->bv_len,
7438 : bvec->bv_offset) < bvec->bv_len)) {
7439 : /*
7440 :			 * inc the count before we submit the bio so
7441 :			 * we know the end IO handler can't drop the count
7442 :			 * to zero under us. Otherwise, the dip might get freed
7443 :			 * before we're done setting it up
7444 : */
7445 : atomic_inc(&dip->pending_bios);
7446 0 : ret = __btrfs_submit_dio_bio(bio, inode, rw,
7447 : file_offset, skip_sum,
7448 : async_submit);
7449 0 : if (ret) {
7450 0 : bio_put(bio);
7451 : atomic_dec(&dip->pending_bios);
7452 : goto out_err;
7453 : }
7454 :
7455 0 : start_sector += submit_len >> 9;
7456 0 : file_offset += submit_len;
7457 :
7458 : submit_len = 0;
7459 : nr_pages = 0;
7460 :
7461 0 : bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
7462 : start_sector, GFP_NOFS);
7463 0 : if (!bio)
7464 : goto out_err;
7465 0 : bio->bi_private = dip;
7466 0 : bio->bi_end_io = btrfs_end_dio_bio;
7467 :
7468 0 : map_length = orig_bio->bi_iter.bi_size;
7469 0 : ret = btrfs_map_block(root->fs_info, rw,
7470 : start_sector << 9,
7471 : &map_length, NULL, 0);
7472 0 : if (ret) {
7473 0 : bio_put(bio);
7474 0 : goto out_err;
7475 : }
7476 : } else {
7477 0 : submit_len += bvec->bv_len;
7478 : nr_pages++;
7479 0 : bvec++;
7480 : }
7481 : }
7482 :
7483 : submit:
7484 25259 : ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
7485 : async_submit);
7486 25260 : if (!ret)
7487 : return 0;
7488 :
7489 0 : bio_put(bio);
7490 : out_err:
7491 0 : dip->errors = 1;
7492 : /*
7493 :	 * before the atomic variable goes to zero, we must
7494 :	 * make sure dip->errors is perceived to be set.
7495 : */
7496 0 : smp_mb__before_atomic();
7497 0 : if (atomic_dec_and_test(&dip->pending_bios))
7498 0 : bio_io_error(dip->orig_bio);
7499 :
7500 :	/* bio_end_io() will handle the error, so we needn't return it */
7501 : return 0;
7502 : }
7503 :
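     :	/*
     :	 * Entry point handed to __blockdev_direct_IO: clone the dio_bio,
     :	 * attach a btrfs_dio_private (with room for read csums), choose the
     :	 * read or write endio and hand off to btrfs_submit_direct_hook.
     :	 */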
7504 25241 : static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7505 : struct inode *inode, loff_t file_offset)
7506 : {
7507 25241 : struct btrfs_root *root = BTRFS_I(inode)->root;
7508 : struct btrfs_dio_private *dip;
7509 : struct bio *io_bio;
7510 : int skip_sum;
7511 : int sum_len;
7512 25241 : int write = rw & REQ_WRITE;
7513 : int ret = 0;
7514 : u16 csum_size;
7515 :
7516 25241 : skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7517 :
7518 25241 : io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
7519 25257 : if (!io_bio) {
7520 : ret = -ENOMEM;
7521 : goto free_ordered;
7522 : }
7523 :
7524 25253 : if (!skip_sum && !write) {
7525 0 : csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
7526 0 : sum_len = dio_bio->bi_iter.bi_size >>
7527 0 : inode->i_sb->s_blocksize_bits;
7528 0 : sum_len *= csum_size;
7529 : } else {
7530 : sum_len = 0;
7531 : }
7532 :
7533 25253 : dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
7534 25258 : if (!dip) {
7535 : ret = -ENOMEM;
7536 : goto free_io_bio;
7537 : }
7538 :
7539 25258 : dip->private = dio_bio->bi_private;
7540 25258 : dip->inode = inode;
7541 25258 : dip->logical_offset = file_offset;
7542 25258 : dip->bytes = dio_bio->bi_iter.bi_size;
7543 25258 : dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
7544 25258 : io_bio->bi_private = dip;
7545 25258 : dip->errors = 0;
7546 25258 : dip->orig_bio = io_bio;
7547 25258 : dip->dio_bio = dio_bio;
7548 : atomic_set(&dip->pending_bios, 0);
7549 :
7550 25258 : if (write)
7551 25258 : io_bio->bi_end_io = btrfs_endio_direct_write;
7552 : else
7553 0 : io_bio->bi_end_io = btrfs_endio_direct_read;
7554 :
7555 25258 : ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
7556 25260 : if (!ret)
7557 25260 : return;
7558 :
7559 : free_io_bio:
7560 0 : bio_put(io_bio);
7561 :
7562 : free_ordered:
7563 : /*
7564 : * If this is a write, we need to clean up the reserved space and kill
7565 : * the ordered extent.
7566 : */
7567 0 : if (write) {
7568 : struct btrfs_ordered_extent *ordered;
7569 0 : ordered = btrfs_lookup_ordered_extent(inode, file_offset);
7570 0 : if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
7571 : !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
7572 0 : btrfs_free_reserved_extent(root, ordered->start,
7573 : ordered->disk_len, 1);
7574 0 : btrfs_put_ordered_extent(ordered);
7575 0 : btrfs_put_ordered_extent(ordered);
7576 : }
7577 0 : bio_endio(dio_bio, ret);
7578 : }
7579 :
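     :	/*
     :	 * Validate DIO alignment against the sector size. Reads additionally
     :	 * reject iovecs that repeat an iov_base; see the comment below about
     :	 * csum errors on read-back.
     :	 */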
7580 25263 : static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
7581 : const struct iov_iter *iter, loff_t offset)
7582 : {
7583 : int seg;
7584 : int i;
7585 25263 : unsigned blocksize_mask = root->sectorsize - 1;
7586 : ssize_t retval = -EINVAL;
7587 :
7588 25263 : if (offset & blocksize_mask)
7589 : goto out;
7590 :
7591 25262 : if (iov_iter_alignment(iter) & blocksize_mask)
7592 : goto out;
7593 :
7594 :	/* If this is a write we don't need to check anything else */
7595 25265 : if (rw & WRITE)
7596 : return 0;
7597 : /*
7598 : * Check to make sure we don't have duplicate iov_base's in this
7599 :	 * iovec; if so return -EINVAL, otherwise we'll get csum errors
7600 : * when reading back.
7601 : */
7602 0 : for (seg = 0; seg < iter->nr_segs; seg++) {
7603 0 : for (i = seg + 1; i < iter->nr_segs; i++) {
7604 0 : if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
7605 : goto out;
7606 : }
7607 : }
7608 : retval = 0;
7609 : out:
7610 : return retval;
7611 : }
7612 :
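     :	/*
     :	 * ->direct_IO: reserve delalloc space for writes (dropping i_mutex
     :	 * while the write stays inside i_size), run the generic blockdev DIO
     :	 * with our get_blocks/submit hooks, then release whatever reservation
     :	 * went unused.
     :	 */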
7613 25262 : static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7614 25270 : struct iov_iter *iter, loff_t offset)
7615 : {
7616 25262 : struct file *file = iocb->ki_filp;
7617 25262 : struct inode *inode = file->f_mapping->host;
7618 : size_t count = 0;
7619 : int flags = 0;
7620 : bool wakeup = true;
7621 : bool relock = false;
7622 : ssize_t ret;
7623 :
7624 25262 : if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
7625 : return 0;
7626 :
7627 25265 : atomic_inc(&inode->i_dio_count);
7628 25268 : smp_mb__after_atomic();
7629 :
7630 : /*
7631 : * The generic stuff only does filemap_write_and_wait_range, which
7632 : * isn't enough if we've written compressed pages to this area, so
7633 : * we need to flush the dirty pages again to make absolutely sure
7634 : * that any outstanding dirty pages are on disk.
7635 : */
7636 : count = iov_iter_count(iter);
7637 25270 : if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7638 : &BTRFS_I(inode)->runtime_flags))
7639 0 : filemap_fdatawrite_range(inode->i_mapping, offset,
7640 0 : offset + count - 1);
7641 :
7642 25270 : if (rw & WRITE) {
7643 : /*
7644 :		 * If the write DIO is beyond the EOF, we need to update
7645 :		 * the isize, but it is protected by i_mutex. So we can
7646 :		 * not unlock the i_mutex in this case.
7647 : */
7648 25270 : if (offset + count <= inode->i_size) {
7649 21447 : mutex_unlock(&inode->i_mutex);
7650 : relock = true;
7651 : }
7652 25270 : ret = btrfs_delalloc_reserve_space(inode, count);
7653 25270 : if (ret)
7654 : goto out;
7655 0 : } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
7656 : &BTRFS_I(inode)->runtime_flags))) {
7657 0 : inode_dio_done(inode);
7658 : flags = DIO_LOCKING | DIO_SKIP_HOLES;
7659 : wakeup = false;
7660 : }
7661 :
7662 25261 : ret = __blockdev_direct_IO(rw, iocb, inode,
7663 25261 : BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
7664 : iter, offset, btrfs_get_blocks_direct, NULL,
7665 : btrfs_submit_direct, flags);
7666 25258 : if (rw & WRITE) {
7667 25257 : if (ret < 0 && ret != -EIOCBQUEUED)
7668 0 : btrfs_delalloc_release_space(inode, count);
7669 25257 : else if (ret >= 0 && (size_t)ret < count)
7670 0 : btrfs_delalloc_release_space(inode,
7671 0 : count - (size_t)ret);
7672 : else
7673 25257 : btrfs_delalloc_release_metadata(inode, 0);
7674 : }
7675 : out:
7676 25269 : if (wakeup)
7677 25269 : inode_dio_done(inode);
7678 25269 : if (relock)
7679 21447 : mutex_lock(&inode->i_mutex);
7680 :
7681 25269 : return ret;
7682 : }
7683 :
7684 : #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
7685 :
7686 851 : static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
7687 : __u64 start, __u64 len)
7688 : {
7689 : int ret;
7690 :
7691 851 : ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
7692 851 : if (ret)
7693 : return ret;
7694 :
7695 344 : return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
7696 : }
7697 :
7698 26884 : int btrfs_readpage(struct file *file, struct page *page)
7699 : {
7700 : struct extent_io_tree *tree;
7701 27771 : tree = &BTRFS_I(page->mapping->host)->io_tree;
7702 27771 : return extent_read_full_page(tree, page, btrfs_get_extent, 0);
7703 : }
7704 :
7705 0 : static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
7706 : {
7707 : struct extent_io_tree *tree;
7708 :
7709 :
7710 0 : if (current->flags & PF_MEMALLOC) {
7711 0 : redirty_page_for_writepage(wbc, page);
7712 0 : unlock_page(page);
7713 0 : return 0;
7714 : }
7715 0 : tree = &BTRFS_I(page->mapping->host)->io_tree;
7716 0 : return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
7717 : }
7718 :
7719 38223 : static int btrfs_writepages(struct address_space *mapping,
7720 : struct writeback_control *wbc)
7721 : {
7722 : struct extent_io_tree *tree;
7723 :
7724 38223 : tree = &BTRFS_I(mapping->host)->io_tree;
7725 38223 : return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
7726 : }
7727 :
7728 : static int
7729 24097 : btrfs_readpages(struct file *file, struct address_space *mapping,
7730 : struct list_head *pages, unsigned nr_pages)
7731 : {
7732 : struct extent_io_tree *tree;
7733 24097 : tree = &BTRFS_I(mapping->host)->io_tree;
7734 24097 : return extent_readpages(tree, mapping, pages, nr_pages,
7735 : btrfs_get_extent);
7736 : }
7737 188223 : static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7738 : {
7739 : struct extent_io_tree *tree;
7740 : struct extent_map_tree *map;
7741 : int ret;
7742 :
7743 188223 : tree = &BTRFS_I(page->mapping->host)->io_tree;
7744 188223 : map = &BTRFS_I(page->mapping->host)->extent_tree;
7745 188223 : ret = try_release_extent_mapping(map, tree, page, gfp_flags);
7746 188224 : if (ret == 1) {
7747 : ClearPagePrivate(page);
7748 188224 : set_page_private(page, 0);
7749 188224 : page_cache_release(page);
7750 : }
7751 188224 : return ret;
7752 : }
7753 :
7754 1568 : static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7755 : {
7756 3136 : if (PageWriteback(page) || PageDirty(page))
7757 : return 0;
7758 1527 : return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
7759 : }
7760 :
7761 3512376 : static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7762 : unsigned int length)
7763 : {
7764 1756188 : struct inode *inode = page->mapping->host;
7765 : struct extent_io_tree *tree;
7766 : struct btrfs_ordered_extent *ordered;
7767 1756188 : struct extent_state *cached_state = NULL;
7768 1756188 : u64 page_start = page_offset(page);
7769 1756188 : u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
7770 1756188 : int inode_evicting = inode->i_state & I_FREEING;
7771 :
7772 : /*
7773 : * we have the page locked, so new writeback can't start,
7774 : * and the dirty bit won't be cleared while we are here.
7775 : *
7776 : * Wait for IO on this page so that we can safely clear
7777 : * the PagePrivate2 bit and do ordered accounting
7778 : */
7779 1756188 : wait_on_page_writeback(page);
7780 :
7781 1756188 : tree = &BTRFS_I(inode)->io_tree;
7782 1756188 : if (offset) {
7783 292 : btrfs_releasepage(page, GFP_NOFS);
7784 1756482 : return;
7785 : }
7786 :
7787 1755896 : if (!inode_evicting)
7788 186696 : lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
7789 1755896 : ordered = btrfs_lookup_ordered_extent(inode, page_start);
7790 1755897 : if (ordered) {
7791 : /*
7792 : * IO on this page will never be started, so we need
7793 : * to account for any ordered extents now
7794 : */
7795 0 : if (!inode_evicting)
7796 0 : clear_extent_bit(tree, page_start, page_end,
7797 : EXTENT_DIRTY | EXTENT_DELALLOC |
7798 : EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7799 : EXTENT_DEFRAG, 1, 0, &cached_state,
7800 : GFP_NOFS);
7801 : /*
7802 : * whoever cleared the private bit is responsible
7803 : * for the finish_ordered_io
7804 : */
7805 0 : if (TestClearPagePrivate2(page)) {
7806 : struct btrfs_ordered_inode_tree *tree;
7807 : u64 new_len;
7808 :
7809 : tree = &BTRFS_I(inode)->ordered_tree;
7810 :
7811 : spin_lock_irq(&tree->lock);
7812 0 : set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
7813 0 : new_len = page_start - ordered->file_offset;
7814 0 : if (new_len < ordered->truncated_len)
7815 0 : ordered->truncated_len = new_len;
7816 : spin_unlock_irq(&tree->lock);
7817 :
7818 0 : if (btrfs_dec_test_ordered_pending(inode, &ordered,
7819 : page_start,
7820 : PAGE_CACHE_SIZE, 1))
7821 0 : btrfs_finish_ordered_io(ordered);
7822 : }
7823 0 : btrfs_put_ordered_extent(ordered);
7824 0 : if (!inode_evicting) {
7825 0 : cached_state = NULL;
7826 0 : lock_extent_bits(tree, page_start, page_end, 0,
7827 : &cached_state);
7828 : }
7829 : }
7830 :
7831 1755898 : if (!inode_evicting) {
7832 186697 : clear_extent_bit(tree, page_start, page_end,
7833 : EXTENT_LOCKED | EXTENT_DIRTY |
7834 : EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
7835 : EXTENT_DEFRAG, 1, 1,
7836 : &cached_state, GFP_NOFS);
7837 :
7838 186696 : __btrfs_releasepage(page, GFP_NOFS);
7839 : }
7840 :
7841 : ClearPageChecked(page);
7842 1755898 : if (PagePrivate(page)) {
7843 : ClearPagePrivate(page);
7844 1569201 : set_page_private(page, 0);
7845 1569201 : page_cache_release(page);
7846 : }
7847 : }
7848 :
7849 : /*
7850 : * btrfs_page_mkwrite() is not allowed to change the file size as it gets
7851 : * called from a page fault handler when a page is first dirtied. Hence we must
7852 : * be careful to check for EOF conditions here. We set the page up correctly
7853 : * for a written page which means we get ENOSPC checking when writing into
7854 : * holes and correct delalloc and unwritten extent mapping on filesystems that
7855 : * support these features.
7856 : *
7857 : * We are not allowed to take the i_mutex here so we have to play games to
7858 : * protect against truncate races as the page could now be beyond EOF. Because
7859 : * vmtruncate() writes the inode size before removing pages, once we have the
7860 : * page lock we can determine safely if the page is beyond EOF. If it is not
7861 : * beyond EOF, then the page is guaranteed safe against truncation until we
7862 : * unlock the page.
7863 : */
7864 6081 : int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
7865 : {
7866 12166 : struct page *page = vmf->page;
7867 12166 : struct inode *inode = file_inode(vma->vm_file);
7868 6081 : struct btrfs_root *root = BTRFS_I(inode)->root;
7869 6081 : struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7870 : struct btrfs_ordered_extent *ordered;
7871 6081 : struct extent_state *cached_state = NULL;
7872 : char *kaddr;
7873 : unsigned long zero_start;
7874 : loff_t size;
7875 : int ret;
7876 : int reserved = 0;
7877 : u64 page_start;
7878 : u64 page_end;
7879 :
7880 6081 : sb_start_pagefault(inode->i_sb);
7881 6081 : ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
7882 6081 : if (!ret) {
7883 6081 : ret = file_update_time(vma->vm_file);
7884 : reserved = 1;
7885 : }
7886 6081 : if (ret) {
7887 0 : if (ret == -ENOMEM)
7888 : ret = VM_FAULT_OOM;
7889 : else /* -ENOSPC, -EIO, etc */
7890 : ret = VM_FAULT_SIGBUS;
7891 0 : if (reserved)
7892 : goto out;
7893 : goto out_noreserve;
7894 : }
7895 :
7896 : ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
7897 : again:
7898 6085 : lock_page(page);
7899 : size = i_size_read(inode);
7900 6085 : page_start = page_offset(page);
7901 6085 : page_end = page_start + PAGE_CACHE_SIZE - 1;
7902 :
7903 12170 : if ((page->mapping != inode->i_mapping) ||
7904 6085 : (page_start >= size)) {
7905 : /* page got truncated out from underneath us */
7906 : goto out_unlock;
7907 : }
7908 6085 : wait_on_page_writeback(page);
7909 :
7910 6085 : lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
7911 6085 : set_page_extent_mapped(page);
7912 :
7913 : /*
7914 : * we can't set the delalloc bits if there are pending ordered
7915 : * extents. Drop our locks and wait for them to finish
7916 : */
7917 6085 : ordered = btrfs_lookup_ordered_extent(inode, page_start);
7918 6085 : if (ordered) {
7919 4 : unlock_extent_cached(io_tree, page_start, page_end,
7920 : &cached_state, GFP_NOFS);
7921 4 : unlock_page(page);
7922 4 : btrfs_start_ordered_extent(inode, ordered, 1);
7923 4 : btrfs_put_ordered_extent(ordered);
7924 4 : goto again;
7925 : }
7926 :
7927 : /*
7928 : * XXX - page_mkwrite gets called every time the page is dirtied, even
7929 : * if it was already dirty, so for space accounting reasons we need to
7930 :	 * clear any delalloc bits for the range we are about to save. There
7931 : * is probably a better way to do this, but for now keep consistent with
7932 : * prepare_pages in the normal write path.
7933 : */
7934 6081 : clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
7935 : EXTENT_DIRTY | EXTENT_DELALLOC |
7936 : EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
7937 : 0, 0, &cached_state, GFP_NOFS);
7938 :
7939 6081 : ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
7940 : &cached_state);
7941 6081 : if (ret) {
7942 0 : unlock_extent_cached(io_tree, page_start, page_end,
7943 : &cached_state, GFP_NOFS);
7944 : ret = VM_FAULT_SIGBUS;
7945 0 : goto out_unlock;
7946 : }
7947 : ret = 0;
7948 :
7949 : /* page is wholly or partially inside EOF */
7950 6081 : if (page_start + PAGE_CACHE_SIZE > size)
7951 0 : zero_start = size & ~PAGE_CACHE_MASK;
7952 : else
7953 : zero_start = PAGE_CACHE_SIZE;
7954 :
7955 6081 : if (zero_start != PAGE_CACHE_SIZE) {
7956 : kaddr = kmap(page);
7957 0 : memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
7958 : flush_dcache_page(page);
7959 : kunmap(page);
7960 : }
7961 : ClearPageChecked(page);
7962 6081 : set_page_dirty(page);
7963 : SetPageUptodate(page);
7964 :
7965 6081 : BTRFS_I(inode)->last_trans = root->fs_info->generation;
7966 6081 : BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
7967 6081 : BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
7968 :
7969 6081 : unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
7970 :
7971 : out_unlock:
7972 6081 : if (!ret) {
7973 6081 : sb_end_pagefault(inode->i_sb);
7974 6081 : return VM_FAULT_LOCKED;
7975 : }
7976 0 : unlock_page(page);
7977 : out:
7978 0 : btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
7979 : out_noreserve:
7980 0 : sb_end_pagefault(inode->i_sb);
7981 0 : return ret;
7982 : }
7983 :
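     :	/*
     :	 * Drop the file extents beyond the new (already updated) i_size,
     :	 * restarting the transaction as needed; the reservation scheme is
     :	 * described in the long comment below.
     :	 */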
7984 2723 : static int btrfs_truncate(struct inode *inode)
7985 : {
7986 5446 : struct btrfs_root *root = BTRFS_I(inode)->root;
7987 : struct btrfs_block_rsv *rsv;
7988 : int ret = 0;
7989 : int err = 0;
7990 : struct btrfs_trans_handle *trans;
7991 2723 : u64 mask = root->sectorsize - 1;
7992 : u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
7993 :
7994 2723 : ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
7995 : (u64)-1);
7996 2723 : if (ret)
7997 : return ret;
7998 :
7999 : /*
8000 :	 * Yes, ladies and gentlemen, this is indeed ugly. The fact is we have
8001 : * 3 things going on here
8002 : *
8003 : * 1) We need to reserve space for our orphan item and the space to
8004 : * delete our orphan item. Lord knows we don't want to have a dangling
8005 : * orphan item because we didn't reserve space to remove it.
8006 : *
8007 : * 2) We need to reserve space to update our inode.
8008 : *
8009 : * 3) We need to have something to cache all the space that is going to
8010 :	 * be freed up by the truncate operation, but also have some slack
8011 : * space reserved in case it uses space during the truncate (thank you
8012 : * very much snapshotting).
8013 : *
8014 :	 * And we need these to all be separate. The fact is we can use a lot of
8015 :	 * space doing the truncate, and we have no earthly idea how much space
8016 :	 * we will use, so we need the truncate reservation to be separate so it
8017 :	 * doesn't end up using space reserved for updating the inode or
8018 :	 * removing the orphan item. We also need to be able to stop the
8019 :	 * transaction and start a new one, which means we need to be able to
8020 :	 * update the inode several times, and we have no way of knowing how
8021 :	 * many times that will be, so we can't just reserve 1 item for the
8022 :	 * entirety of the operation, so that has to be done separately as well.
8023 : * Then there is the orphan item, which does indeed need to be held on
8024 : * to for the whole operation, and we need nobody to touch this reserved
8025 : * space except the orphan code.
8026 : *
8027 : * So that leaves us with
8028 : *
8029 : * 1) root->orphan_block_rsv - for the orphan deletion.
8030 : * 2) rsv - for the truncate reservation, which we will steal from the
8031 : * transaction reservation.
8032 :	 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
8033 : * updating the inode.
8034 : */
8035 2723 : rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
8036 2723 : if (!rsv)
8037 : return -ENOMEM;
8038 2723 : rsv->size = min_size;
8039 2723 : rsv->failfast = 1;
8040 :
8041 : /*
8042 : * 1 for the truncate slack space
8043 : * 1 for updating the inode.
8044 : */
8045 2723 : trans = btrfs_start_transaction(root, 2);
8046 2723 : if (IS_ERR(trans)) {
8047 0 : err = PTR_ERR(trans);
8048 0 : goto out;
8049 : }
8050 :
8051 : /* Migrate the slack space for the truncate to our reserve */
8052 2723 : ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
8053 : min_size);
8054 2723 : BUG_ON(ret);
8055 :
8056 : /*
8057 :	 * So if we truncate and then write and fsync, we would normally just
8058 : * write the extents that changed, which is a problem if we need to
8059 : * first truncate that entire inode. So set this flag so we write out
8060 : * all of the extents in the inode to the sync log so we're completely
8061 : * safe.
8062 : */
8063 : set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
8064 2723 : trans->block_rsv = rsv;
8065 :
8066 : while (1) {
8067 2723 : ret = btrfs_truncate_inode_items(trans, root, inode,
8068 2723 : inode->i_size,
8069 : BTRFS_EXTENT_DATA_KEY);
8070 2723 : if (ret != -ENOSPC) {
8071 : err = ret;
8072 : break;
8073 : }
8074 :
8075 0 : trans->block_rsv = &root->fs_info->trans_block_rsv;
8076 0 : ret = btrfs_update_inode(trans, root, inode);
8077 0 : if (ret) {
8078 : err = ret;
8079 : break;
8080 : }
8081 :
8082 0 : btrfs_end_transaction(trans, root);
8083 0 : btrfs_btree_balance_dirty(root);
8084 :
8085 0 : trans = btrfs_start_transaction(root, 2);
8086 0 : if (IS_ERR(trans)) {
8087 0 : ret = err = PTR_ERR(trans);
8088 : trans = NULL;
8089 0 : break;
8090 : }
8091 :
8092 0 : ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
8093 : rsv, min_size);
8094 0 : BUG_ON(ret); /* shouldn't happen */
8095 0 : trans->block_rsv = rsv;
8096 0 : }
8097 :
8098 2723 : if (ret == 0 && inode->i_nlink > 0) {
8099 2723 : trans->block_rsv = root->orphan_block_rsv;
8100 2723 : ret = btrfs_orphan_del(trans, inode);
8101 2723 : if (ret)
8102 : err = ret;
8103 : }
8104 :
8105 2723 : if (trans) {
8106 2723 : trans->block_rsv = &root->fs_info->trans_block_rsv;
8107 2723 : ret = btrfs_update_inode(trans, root, inode);
8108 2723 : if (ret && !err)
8109 : err = ret;
8110 :
8111 2723 : ret = btrfs_end_transaction(trans, root);
8112 2723 : btrfs_btree_balance_dirty(root);
8113 : }
8114 :
8115 : out:
8116 2723 : btrfs_free_block_rsv(root, rsv);
8117 :
8118 2723 : if (ret && !err)
8119 : err = ret;
8120 :
8121 2723 : return err;
8122 : }
8123 :
8124 : /*
8125 : * create a new subvolume directory/inode (helper for the ioctl).
8126 : */
8127 49 : int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8128 : struct btrfs_root *new_root,
8129 : struct btrfs_root *parent_root,
8130 : u64 new_dirid)
8131 : {
8132 : struct inode *inode;
8133 : int err;
8134 49 : u64 index = 0;
8135 :
8136 49 : inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
8137 : new_dirid, new_dirid,
8138 49 : S_IFDIR | (~current_umask() & S_IRWXUGO),
8139 : &index);
8140 49 : if (IS_ERR(inode))
8141 0 : return PTR_ERR(inode);
8142 49 : inode->i_op = &btrfs_dir_inode_operations;
8143 49 : inode->i_fop = &btrfs_dir_file_operations;
8144 :
8145 49 : set_nlink(inode, 1);
8146 : btrfs_i_size_write(inode, 0);
8147 49 : unlock_new_inode(inode);
8148 :
8149 49 : err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
8150 49 : if (err)
8151 0 : btrfs_err(new_root->fs_info,
8152 : "error inheriting subvolume %llu properties: %d",
8153 : new_root->root_key.objectid, err);
8154 :
8155 49 : err = btrfs_update_inode(trans, new_root, inode);
8156 :
8157 49 : iput(inode);
8158 49 : return err;
8159 : }
8160 :
8161 25711 : struct inode *btrfs_alloc_inode(struct super_block *sb)
8162 : {
8163 : struct btrfs_inode *ei;
8164 : struct inode *inode;
8165 :
8166 25711 : ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
8167 25711 : if (!ei)
8168 : return NULL;
8169 :
8170 25709 : ei->root = NULL;
8171 25709 : ei->generation = 0;
8172 25709 : ei->last_trans = 0;
8173 25709 : ei->last_sub_trans = 0;
8174 25709 : ei->logged_trans = 0;
8175 25709 : ei->delalloc_bytes = 0;
8176 25709 : ei->disk_i_size = 0;
8177 25709 : ei->flags = 0;
8178 25709 : ei->csum_bytes = 0;
8179 25709 : ei->index_cnt = (u64)-1;
8180 25709 : ei->dir_index = 0;
8181 25709 : ei->last_unlink_trans = 0;
8182 25709 : ei->last_log_commit = 0;
8183 :
8184 25709 : spin_lock_init(&ei->lock);
8185 25709 : ei->outstanding_extents = 0;
8186 25709 : ei->reserved_extents = 0;
8187 :
8188 25709 : ei->runtime_flags = 0;
8189 25709 : ei->force_compress = BTRFS_COMPRESS_NONE;
8190 :
8191 25709 : ei->delayed_node = NULL;
8192 :
8193 25709 : inode = &ei->vfs_inode;
8194 25709 : extent_map_tree_init(&ei->extent_tree);
8195 25707 : extent_io_tree_init(&ei->io_tree, &inode->i_data);
8196 25707 : extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
8197 25706 : ei->io_tree.track_uptodate = 1;
8198 25706 : ei->io_failure_tree.track_uptodate = 1;
8199 : atomic_set(&ei->sync_writers, 0);
8200 25706 : mutex_init(&ei->log_mutex);
8201 25707 : mutex_init(&ei->delalloc_mutex);
8202 : btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8203 25707 : INIT_LIST_HEAD(&ei->delalloc_inodes);
8204 25707 : RB_CLEAR_NODE(&ei->rb_node);
8205 :
8206 25707 : return inode;
8207 : }
8208 :
8209 : #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8210 : void btrfs_test_destroy_inode(struct inode *inode)
8211 : {
8212 : btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
8213 : kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8214 : }
8215 : #endif
8216 :
8217 25704 : static void btrfs_i_callback(struct rcu_head *head)
8218 : {
8219 : struct inode *inode = container_of(head, struct inode, i_rcu);
8220 25704 : kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8221 25704 : }
8222 :
8223 25704 : void btrfs_destroy_inode(struct inode *inode)
8224 : {
8225 : struct btrfs_ordered_extent *ordered;
8226 25704 : struct btrfs_root *root = BTRFS_I(inode)->root;
8227 :
8228 25704 : WARN_ON(!hlist_empty(&inode->i_dentry));
8229 25704 : WARN_ON(inode->i_data.nrpages);
8230 25704 : WARN_ON(BTRFS_I(inode)->outstanding_extents);
8231 25704 : WARN_ON(BTRFS_I(inode)->reserved_extents);
8232 25704 : WARN_ON(BTRFS_I(inode)->delalloc_bytes);
8233 25704 : WARN_ON(BTRFS_I(inode)->csum_bytes);
8234 :
8235 : /*
8236 :	 * This can happen when we create an inode, but somebody else also
8237 : * created the same inode and we need to destroy the one we already
8238 : * created.
8239 : */
8240 25704 : if (!root)
8241 : goto free;
8242 :
8243 25704 : if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8244 : &BTRFS_I(inode)->runtime_flags)) {
8245 0 : btrfs_info(root->fs_info, "inode %llu still on the orphan list",
8246 : btrfs_ino(inode));
8247 0 : atomic_dec(&root->orphan_inodes);
8248 : }
8249 :
8250 : while (1) {
8251 25704 : ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
8252 25704 : if (!ordered)
8253 : break;
8254 : else {
8255 0 : btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
8256 : ordered->file_offset, ordered->len);
8257 0 : btrfs_remove_ordered_extent(inode, ordered);
8258 0 : btrfs_put_ordered_extent(ordered);
8259 0 : btrfs_put_ordered_extent(ordered);
8260 : }
8261 0 : }
8262 25704 : inode_tree_del(inode);
8263 25704 : btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
8264 : free:
8265 25704 : call_rcu(&inode->i_rcu, btrfs_i_callback);
8266 25704 : }
8267 :
8268 85705 : int btrfs_drop_inode(struct inode *inode)
8269 : {
8270 85705 : struct btrfs_root *root = BTRFS_I(inode)->root;
8271 :
8272 85705 : if (root == NULL)
8273 : return 1;
8274 :
8275 : /* the snap/subvol tree is on deleting */
8276 85704 : if (btrfs_root_refs(&root->root_item) == 0)
8277 : return 1;
8278 : else
8279 85649 : return generic_drop_inode(inode);
8280 : }
8281 :
8282 15021 : static void init_once(void *foo)
8283 : {
8284 : struct btrfs_inode *ei = (struct btrfs_inode *) foo;
8285 :
8286 15021 : inode_init_once(&ei->vfs_inode);
8287 15021 : }
8288 :
8289 0 : void btrfs_destroy_cachep(void)
8290 : {
8291 : /*
8292 :	 * Make sure all delayed RCU-freed inodes are flushed before we
8293 :	 * destroy the cache.
8294 : */
8295 0 : rcu_barrier();
8296 0 : if (btrfs_inode_cachep)
8297 0 : kmem_cache_destroy(btrfs_inode_cachep);
8298 0 : if (btrfs_trans_handle_cachep)
8299 0 : kmem_cache_destroy(btrfs_trans_handle_cachep);
8300 0 : if (btrfs_transaction_cachep)
8301 0 : kmem_cache_destroy(btrfs_transaction_cachep);
8302 0 : if (btrfs_path_cachep)
8303 0 : kmem_cache_destroy(btrfs_path_cachep);
8304 0 : if (btrfs_free_space_cachep)
8305 0 : kmem_cache_destroy(btrfs_free_space_cachep);
8306 0 : if (btrfs_delalloc_work_cachep)
8307 0 : kmem_cache_destroy(btrfs_delalloc_work_cachep);
8308 0 : }
8309 :
8310 0 : int btrfs_init_cachep(void)
8311 : {
8312 0 : btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8313 : sizeof(struct btrfs_inode), 0,
8314 : SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
8315 0 : if (!btrfs_inode_cachep)
8316 : goto fail;
8317 :
8318 0 : btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
8319 : sizeof(struct btrfs_trans_handle), 0,
8320 : SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8321 0 : if (!btrfs_trans_handle_cachep)
8322 : goto fail;
8323 :
8324 0 : btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
8325 : sizeof(struct btrfs_transaction), 0,
8326 : SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8327 0 : if (!btrfs_transaction_cachep)
8328 : goto fail;
8329 :
8330 0 : btrfs_path_cachep = kmem_cache_create("btrfs_path",
8331 : sizeof(struct btrfs_path), 0,
8332 : SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8333 0 : if (!btrfs_path_cachep)
8334 : goto fail;
8335 :
8336 0 : btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
8337 : sizeof(struct btrfs_free_space), 0,
8338 : SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8339 0 : if (!btrfs_free_space_cachep)
8340 : goto fail;
8341 :
8342 0 : btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
8343 : sizeof(struct btrfs_delalloc_work), 0,
8344 : SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
8345 : NULL);
8346 0 : if (!btrfs_delalloc_work_cachep)
8347 : goto fail;
8348 :
8349 : return 0;
8350 : fail:
8351 0 : btrfs_destroy_cachep();
8352 0 : return -ENOMEM;
8353 : }
8354 :
8355 853993 : static int btrfs_getattr(struct vfsmount *mnt,
8356 : struct dentry *dentry, struct kstat *stat)
8357 : {
8358 : u64 delalloc_bytes;
8359 853993 : struct inode *inode = dentry->d_inode;
8360 853993 : u32 blocksize = inode->i_sb->s_blocksize;
8361 :
8362 853993 : generic_fillattr(inode, stat);
8363 854030 : stat->dev = BTRFS_I(inode)->root->anon_dev;
8364 854030 : stat->blksize = PAGE_CACHE_SIZE;
8365 :
8366 : spin_lock(&BTRFS_I(inode)->lock);
8367 854129 : delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
8368 : spin_unlock(&BTRFS_I(inode)->lock);
8369 2562401 : stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
8370 1708322 : ALIGN(delalloc_bytes, blocksize)) >> 9;
8371 854161 : return 0;
8372 : }
8373 :
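     :	/*
     :	 * Rename: only subvolume links may move between roots (anything else
     :	 * across subvolumes returns -EXDEV). Remove the old directory entry,
     :	 * add the new one, and pin the log so a log commit can't land between
     :	 * the unlink and the re-link.
     :	 */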
8374 2315 : static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8375 : struct inode *new_dir, struct dentry *new_dentry)
8376 : {
8377 1 : struct btrfs_trans_handle *trans;
8378 2315 : struct btrfs_root *root = BTRFS_I(old_dir)->root;
8379 2315 : struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8380 2315 : struct inode *new_inode = new_dentry->d_inode;
8381 2315 : struct inode *old_inode = old_dentry->d_inode;
8382 2315 : struct timespec ctime = CURRENT_TIME;
8383 2315 : u64 index = 0;
8384 : u64 root_objectid;
8385 : int ret;
8386 : u64 old_ino = btrfs_ino(old_inode);
8387 :
8388 2315 : if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
8389 : return -EPERM;
8390 :
8391 : /* we only allow rename subvolume link between subvolumes */
8392 2315 : if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
8393 : return -EXDEV;
8394 :
8395 2312 : if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
8396 12 : (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
8397 : return -ENOTEMPTY;
8398 :
8399 2312 : if (S_ISDIR(old_inode->i_mode) && new_inode &&
8400 0 : new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
8401 : return -ENOTEMPTY;
8402 :
8403 :
8404 : /* check for collisions, even if the name isn't there */
8405 4624 : ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
8406 2312 : new_dentry->d_name.name,
8407 2312 : new_dentry->d_name.len);
8408 :
8409 2312 : if (ret) {
8410 12 : if (ret == -EEXIST) {
8411 :			/* we shouldn't get
8412 :			 * -EEXIST without a new_inode */
8413 12 : if (WARN_ON(!new_inode)) {
8414 : return ret;
8415 : }
8416 : } else {
8417 : /* maybe -EOVERFLOW */
8418 : return ret;
8419 : }
8420 : }
8421 : ret = 0;
8422 :
8423 : /*
8424 : * we're using rename to replace one file with another. Start IO on it
8425 : * now so we don't add too much work to the end of the transaction
8426 : */
8427 2312 : if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8428 12 : filemap_flush(old_inode->i_mapping);
8429 :
8430 : /* close the racy window with snapshot create/destroy ioctl */
8431 2312 : if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8432 1 : down_read(&root->fs_info->subvol_sem);
8433 : /*
8434 : * We want to reserve the absolute worst case amount of items. So if
8435 :	 * We want to reserve the absolute worst case number of items. So if
8436 :	 * both inodes are subvols and we need to unlink them then that would
8437 :	 * require 4 item modifications, but if they are both normal inodes it
8438 :	 * would require 5 item modifications, so we'll assume they're normal
8439 : * should cover the worst case number of items we'll modify.
8440 : */
8441 2312 : trans = btrfs_start_transaction(root, 11);
8442 2312 : if (IS_ERR(trans)) {
8443 0 : ret = PTR_ERR(trans);
8444 0 : goto out_notrans;
8445 : }
8446 :
8447 2312 : if (dest != root)
8448 1 : btrfs_record_root_in_trans(trans, dest);
8449 :
8450 2312 : ret = btrfs_set_inode_index(new_dir, &index);
8451 2312 : if (ret)
8452 : goto out_fail;
8453 :
8454 2312 : BTRFS_I(old_inode)->dir_index = 0ULL;
8455 2312 : if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8456 : /* force full log commit if subvolume involved. */
8457 1 : btrfs_set_log_full_commit(root->fs_info, trans);
8458 : } else {
8459 6933 : ret = btrfs_insert_inode_ref(trans, dest,
8460 2311 : new_dentry->d_name.name,
8461 2311 : new_dentry->d_name.len,
8462 : old_ino,
8463 : btrfs_ino(new_dir), index);
8464 2311 : if (ret)
8465 : goto out_fail;
8466 : /*
8467 : * this is an ugly little race, but the rename is required
8468 : * to make sure that if we crash, the inode is either at the
8469 : * old name or the new one. pinning the log transaction lets
8470 : * us make sure we don't allow a log commit to come in after
8471 : * we unlink the name but before we add the new name back in.
8472 : */
8473 2311 : btrfs_pin_log_trans(root);
8474 : }
8475 :
8476 : inode_inc_iversion(old_dir);
8477 : inode_inc_iversion(new_dir);
8478 : inode_inc_iversion(old_inode);
8479 2312 : old_dir->i_ctime = old_dir->i_mtime = ctime;
8480 2312 : new_dir->i_ctime = new_dir->i_mtime = ctime;
8481 2312 : old_inode->i_ctime = ctime;
8482 :
8483 2312 : if (old_dentry->d_parent != new_dentry->d_parent)
8484 2119 : btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
8485 :
8486 2312 : if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8487 1 : root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
8488 2 : ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
8489 1 : old_dentry->d_name.name,
8490 1 : old_dentry->d_name.len);
8491 : } else {
8492 4622 : ret = __btrfs_unlink_inode(trans, root, old_dir,
8493 : old_dentry->d_inode,
8494 2311 : old_dentry->d_name.name,
8495 2311 : old_dentry->d_name.len);
8496 2311 : if (!ret)
8497 2311 : ret = btrfs_update_inode(trans, root, old_inode);
8498 : }
8499 2312 : if (ret) {
8500 0 : btrfs_abort_transaction(trans, root, ret);
8501 0 : goto out_fail;
8502 : }
8503 :
8504 2312 : if (new_inode) {
8505 : inode_inc_iversion(new_inode);
8506 12 : new_inode->i_ctime = CURRENT_TIME;
8507 12 : if (unlikely(btrfs_ino(new_inode) ==
8508 : BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8509 : root_objectid = BTRFS_I(new_inode)->location.objectid;
8510 0 : ret = btrfs_unlink_subvol(trans, dest, new_dir,
8511 : root_objectid,
8512 0 : new_dentry->d_name.name,
8513 0 : new_dentry->d_name.len);
8514 0 : BUG_ON(new_inode->i_nlink == 0);
8515 : } else {
8516 24 : ret = btrfs_unlink_inode(trans, dest, new_dir,
8517 : new_dentry->d_inode,
8518 12 : new_dentry->d_name.name,
8519 12 : new_dentry->d_name.len);
8520 : }
8521 12 : if (!ret && new_inode->i_nlink == 0)
8522 11 : ret = btrfs_orphan_add(trans, new_dentry->d_inode);
8523 12 : if (ret) {
8524 0 : btrfs_abort_transaction(trans, root, ret);
8525 0 : goto out_fail;
8526 : }
8527 : }
8528 :
8529 6936 : ret = btrfs_add_link(trans, new_dir, old_inode,
8530 2312 : new_dentry->d_name.name,
8531 2312 : new_dentry->d_name.len, 0, index);
8532 2312 : if (ret) {
8533 0 : btrfs_abort_transaction(trans, root, ret);
8534 0 : goto out_fail;
8535 : }
8536 :
8537 2312 : if (old_inode->i_nlink == 1)
8538 1934 : BTRFS_I(old_inode)->dir_index = index;
8539 :
8540 2312 : if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
8541 2311 : struct dentry *parent = new_dentry->d_parent;
8542 2311 : btrfs_log_new_name(trans, old_inode, old_dir, parent);
8543 2311 : btrfs_end_log_trans(root);
8544 : }
8545 : out_fail:
8546 2312 : btrfs_end_transaction(trans, root);
8547 : out_notrans:
8548 2312 : if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8549 1 : up_read(&root->fs_info->subvol_sem);
8550 :
8551 2312 : return ret;
8552 : }
8553 :
8554 2315 : static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
8555 : struct inode *new_dir, struct dentry *new_dentry,
8556 : unsigned int flags)
8557 : {
8558 2315 : if (flags & ~RENAME_NOREPLACE)
8559 : return -EINVAL;
8560 :
8561 2315 : return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
8562 : }
8563 :
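/*
 * Illustrative userspace sketch (not part of the kernel source): a
 * minimal caller exercising the RENAME_NOREPLACE flag accepted by
 * btrfs_rename2() above. The raw syscall is used on the assumption
 * that the C library headers define SYS_renameat2 but may lack a
 * renameat2() wrapper; the "old"/"new" paths are hypothetical.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef RENAME_NOREPLACE
#define RENAME_NOREPLACE (1 << 0)
#endif

int main(void)
{
	/* Fails with EEXIST rather than replacing "new" if it exists. */
	if (syscall(SYS_renameat2, AT_FDCWD, "old", AT_FDCWD, "new",
		    RENAME_NOREPLACE) == -1)
		perror("renameat2");
	return 0;
}
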
8564 72 : static void btrfs_run_delalloc_work(struct btrfs_work *work)
8565 : {
8566 : struct btrfs_delalloc_work *delalloc_work;
8567 : struct inode *inode;
8568 :
8569 : delalloc_work = container_of(work, struct btrfs_delalloc_work,
8570 : work);
8571 72 : inode = delalloc_work->inode;
8572 72 : if (delalloc_work->wait) {
8573 0 : btrfs_wait_ordered_range(inode, 0, (u64)-1);
8574 : } else {
8575 72 : filemap_flush(inode->i_mapping);
8576 72 : if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8577 : &BTRFS_I(inode)->runtime_flags))
8578 3 : filemap_flush(inode->i_mapping);
8579 : }
8580 :
8581 72 : if (delalloc_work->delay_iput)
8582 0 : btrfs_add_delayed_iput(inode);
8583 : else
8584 72 : iput(inode);
8585 72 : complete(&delalloc_work->completion);
8586 72 : }
8587 :
8588 72 : struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8589 : int wait, int delay_iput)
8590 : {
8591 : struct btrfs_delalloc_work *work;
8592 :
8593 72 : work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
8594 72 : if (!work)
8595 : return NULL;
8596 :
8597 : init_completion(&work->completion);
8598 72 : INIT_LIST_HEAD(&work->list);
8599 72 : work->inode = inode;
8600 72 : work->wait = wait;
8601 72 : work->delay_iput = delay_iput;
8602 72 : WARN_ON_ONCE(!inode);
8603 72 : btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
8604 : btrfs_run_delalloc_work, NULL, NULL);
8605 :
8606 72 : return work;
8607 : }
8608 :
8609 72 : void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8610 : {
8611 72 : wait_for_completion(&work->completion);
8612 72 : kmem_cache_free(btrfs_delalloc_work_cachep, work);
8613 72 : }
8614 :
8615 : /*
8616 : * Some fairly slow code that needs optimization: walk the list of
8617 : * all inodes with pending delalloc and force their dirty data to disk.
8618 : */
8619 155 : static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8620 : int nr)
8621 : {
8622 : struct btrfs_inode *binode;
8623 : struct inode *inode;
8624 : struct btrfs_delalloc_work *work, *next;
8625 : struct list_head works;
8626 : struct list_head splice;
8627 : int ret = 0;
8628 :
8629 : INIT_LIST_HEAD(&works);
8630 : INIT_LIST_HEAD(&splice);
8631 :
8632 155 : mutex_lock(&root->delalloc_mutex);
8633 : spin_lock(&root->delalloc_lock);
8634 155 : list_splice_init(&root->delalloc_inodes, &splice);
8635 227 : while (!list_empty(&splice)) {
8636 : binode = list_entry(splice.next, struct btrfs_inode,
8637 : delalloc_inodes);
8638 :
8639 72 : list_move_tail(&binode->delalloc_inodes,
8640 : &root->delalloc_inodes);
8641 72 : inode = igrab(&binode->vfs_inode);
8642 72 : if (!inode) {
8643 0 : cond_resched_lock(&root->delalloc_lock);
8644 0 : continue;
8645 : }
8646 : spin_unlock(&root->delalloc_lock);
8647 :
8648 72 : work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8649 72 : if (unlikely(!work)) {
8650 0 : if (delay_iput)
8651 0 : btrfs_add_delayed_iput(inode);
8652 : else
8653 0 : iput(inode);
8654 : ret = -ENOMEM;
8655 : goto out;
8656 : }
8657 72 : list_add_tail(&work->list, &works);
8658 72 : btrfs_queue_work(root->fs_info->flush_workers,
8659 : &work->work);
8660 72 : ret++;
8661 72 : if (nr != -1 && ret >= nr)
8662 : goto out;
8663 72 : cond_resched();
8664 : spin_lock(&root->delalloc_lock);
8665 : }
8666 : spin_unlock(&root->delalloc_lock);
8667 :
8668 : out:
8669 227 : list_for_each_entry_safe(work, next, &works, list) {
8670 : list_del_init(&work->list);
8671 72 : btrfs_wait_and_free_delalloc_work(work);
8672 : }
8673 :
8674 155 : if (!list_empty_careful(&splice)) {
8675 : spin_lock(&root->delalloc_lock);
8676 : list_splice_tail(&splice, &root->delalloc_inodes);
8677 : spin_unlock(&root->delalloc_lock);
8678 : }
8679 155 : mutex_unlock(&root->delalloc_mutex);
8680 155 : return ret;
8681 : }
8682 :
8683 146 : int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8684 : {
8685 : int ret;
8686 :
8687 292 : if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
8688 : return -EROFS;
8689 :
8690 146 : ret = __start_delalloc_inodes(root, delay_iput, -1);
8691 146 : if (ret > 0)
8692 : ret = 0;
8693 : /*
8694 : 	 * The filemap_flush() calls queue IO into the worker threads, but
8695 : 	 * we have to make sure the IO is actually started and that
8696 : 	 * ordered extents get created before we return.
8697 : */
8698 146 : atomic_inc(&root->fs_info->async_submit_draining);
8699 454 : while (atomic_read(&root->fs_info->nr_async_submits) ||
8700 : atomic_read(&root->fs_info->async_delalloc_pages)) {
8701 49 : wait_event(root->fs_info->async_submit_wait,
8702 : (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
8703 : atomic_read(&root->fs_info->async_delalloc_pages) == 0));
8704 : }
8705 146 : atomic_dec(&root->fs_info->async_submit_draining);
8706 146 : return ret;
8707 : }
8708 :
8709 90 : int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
8710 : int nr)
8711 : {
8712 : struct btrfs_root *root;
8713 : struct list_head splice;
8714 : int ret;
8715 :
8716 90 : if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
8717 : return -EROFS;
8718 :
8719 : INIT_LIST_HEAD(&splice);
8720 :
8721 90 : mutex_lock(&fs_info->delalloc_root_mutex);
8722 : spin_lock(&fs_info->delalloc_root_lock);
8723 90 : list_splice_init(&fs_info->delalloc_roots, &splice);
8724 99 : while (!list_empty(&splice) && nr) {
8725 9 : root = list_first_entry(&splice, struct btrfs_root,
8726 : delalloc_root);
8727 9 : root = btrfs_grab_fs_root(root);
8728 9 : BUG_ON(!root);
8729 9 : list_move_tail(&root->delalloc_root,
8730 : &fs_info->delalloc_roots);
8731 : spin_unlock(&fs_info->delalloc_root_lock);
8732 :
8733 9 : ret = __start_delalloc_inodes(root, delay_iput, nr);
8734 9 : btrfs_put_fs_root(root);
8735 9 : if (ret < 0)
8736 : goto out;
8737 :
8738 9 : if (nr != -1) {
8739 0 : nr -= ret;
8740 0 : WARN_ON(nr < 0);
8741 : }
8742 : spin_lock(&fs_info->delalloc_root_lock);
8743 : }
8744 : spin_unlock(&fs_info->delalloc_root_lock);
8745 :
8746 : ret = 0;
8747 90 : atomic_inc(&fs_info->async_submit_draining);
8748 187 : while (atomic_read(&fs_info->nr_async_submits) ||
8749 : atomic_read(&fs_info->async_delalloc_pages)) {
8750 499 : wait_event(fs_info->async_submit_wait,
8751 : (atomic_read(&fs_info->nr_async_submits) == 0 &&
8752 : atomic_read(&fs_info->async_delalloc_pages) == 0));
8753 : }
8754 : atomic_dec(&fs_info->async_submit_draining);
8755 : out:
8756 90 : if (!list_empty_careful(&splice)) {
8757 : spin_lock(&fs_info->delalloc_root_lock);
8758 : list_splice_tail(&splice, &fs_info->delalloc_roots);
8759 : spin_unlock(&fs_info->delalloc_root_lock);
8760 : }
8761 90 : mutex_unlock(&fs_info->delalloc_root_mutex);
8762 90 : return ret;
8763 : }
8764 :
8765 4219 : static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8766 : const char *symname)
8767 : {
8768 : struct btrfs_trans_handle *trans;
8769 2172 : struct btrfs_root *root = BTRFS_I(dir)->root;
8770 : struct btrfs_path *path;
8771 : struct btrfs_key key;
8772 : struct inode *inode = NULL;
8773 : int err;
8774 : int drop_inode = 0;
8775 : u64 objectid;
8776 2172 : u64 index = 0;
8777 : int name_len;
8778 : int datasize;
8779 : unsigned long ptr;
8780 : struct btrfs_file_extent_item *ei;
8781 : struct extent_buffer *leaf;
8782 :
8783 2172 : name_len = strlen(symname);
8784 2172 : if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
8785 : return -ENAMETOOLONG;
8786 :
8787 : /*
8788 : * 2 items for inode item and ref
8789 : * 2 items for dir items
8790 : * 1 item for xattr if selinux is on
8791 : */
8792 2047 : trans = btrfs_start_transaction(root, 5);
8793 2047 : if (IS_ERR(trans))
8794 0 : return PTR_ERR(trans);
8795 :
8796 2047 : err = btrfs_find_free_ino(root, &objectid);
8797 2047 : if (err)
8798 : goto out_unlock;
8799 :
8800 4094 : inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
8801 2047 : dentry->d_name.len, btrfs_ino(dir), objectid,
8802 : S_IFLNK|S_IRWXUGO, &index);
8803 2047 : if (IS_ERR(inode)) {
8804 0 : err = PTR_ERR(inode);
8805 0 : goto out_unlock;
8806 : }
8807 :
8808 : /*
8809 : * If the active LSM wants to access the inode during
8810 : * d_instantiate it needs these. Smack checks to see
8811 : * if the filesystem supports xattrs by looking at the
8812 : * ops vector.
8813 : */
8814 2047 : inode->i_fop = &btrfs_file_operations;
8815 2047 : inode->i_op = &btrfs_file_inode_operations;
8816 2047 : inode->i_mapping->a_ops = &btrfs_aops;
8817 2047 : inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8818 2047 : BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8819 :
8820 2047 : err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
8821 2047 : if (err)
8822 : goto out_unlock_inode;
8823 :
8824 2047 : err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
8825 2047 : if (err)
8826 : goto out_unlock_inode;
8827 :
8828 2047 : path = btrfs_alloc_path();
8829 2047 : if (!path) {
8830 : err = -ENOMEM;
8831 : goto out_unlock_inode;
8832 : }
8833 2047 : key.objectid = btrfs_ino(inode);
8834 2047 : key.offset = 0;
8835 : btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
8836 2047 : datasize = btrfs_file_extent_calc_inline_size(name_len);
8837 : err = btrfs_insert_empty_item(trans, root, path, &key,
8838 : datasize);
8839 2047 : if (err) {
8840 0 : btrfs_free_path(path);
8841 0 : goto out_unlock_inode;
8842 : }
8843 2047 : leaf = path->nodes[0];
8844 4094 : ei = btrfs_item_ptr(leaf, path->slots[0],
8845 : struct btrfs_file_extent_item);
8846 2047 : btrfs_set_file_extent_generation(leaf, ei, trans->transid);
8847 : btrfs_set_file_extent_type(leaf, ei,
8848 : BTRFS_FILE_EXTENT_INLINE);
8849 : btrfs_set_file_extent_encryption(leaf, ei, 0);
8850 : btrfs_set_file_extent_compression(leaf, ei, 0);
8851 : btrfs_set_file_extent_other_encoding(leaf, ei, 0);
8852 : btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
8853 :
8854 : ptr = btrfs_file_extent_inline_start(ei);
8855 2047 : write_extent_buffer(leaf, symname, ptr, name_len);
8856 2047 : btrfs_mark_buffer_dirty(leaf);
8857 2047 : btrfs_free_path(path);
8858 :
8859 2047 : inode->i_op = &btrfs_symlink_inode_operations;
8860 2047 : inode->i_mapping->a_ops = &btrfs_symlink_aops;
8861 2047 : inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8862 2047 : inode_set_bytes(inode, name_len);
8863 : btrfs_i_size_write(inode, name_len);
8864 2047 : err = btrfs_update_inode(trans, root, inode);
8865 2047 : if (err) {
8866 : drop_inode = 1;
8867 : goto out_unlock_inode;
8868 : }
8869 :
8870 2047 : unlock_new_inode(inode);
8871 2047 : d_instantiate(dentry, inode);
8872 :
8873 : out_unlock:
8874 2047 : btrfs_end_transaction(trans, root);
8875 2047 : if (drop_inode) {
8876 : inode_dec_link_count(inode);
8877 0 : iput(inode);
8878 : }
8879 2047 : btrfs_btree_balance_dirty(root);
8880 2047 : return err;
8881 :
8882 : out_unlock_inode:
8883 : drop_inode = 1;
8884 0 : unlock_new_inode(inode);
8885 0 : goto out_unlock;
8886 : }
8887 :
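/*
 * Illustrative userspace sketch (not part of the kernel source):
 * btrfs_symlink() above stores the target as a single inline file
 * extent, so targets longer than BTRFS_MAX_INLINE_DATA_SIZE(root) are
 * rejected with ENAMETOOLONG. On a filesystem with small (4K) leaves
 * that limit sits below PATH_MAX, which this probe can observe; the
 * mount point is hypothetical.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char target[4095];

	memset(target, 'x', sizeof(target) - 1);
	target[sizeof(target) - 1] = '\0';

	/* May fail with ENAMETOOLONG on btrfs even though the target
	 * is still shorter than PATH_MAX. */
	if (symlink(target, "/mnt/btrfs/longlink") == -1)
		perror("symlink");
	return 0;
}
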
8888 7582 : static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
8889 : u64 start, u64 num_bytes, u64 min_size,
8890 : loff_t actual_len, u64 *alloc_hint,
8891 : struct btrfs_trans_handle *trans)
8892 : {
8893 7582 : struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
8894 : struct extent_map *em;
8895 7582 : struct btrfs_root *root = BTRFS_I(inode)->root;
8896 : struct btrfs_key ins;
8897 : u64 cur_offset = start;
8898 : u64 i_size;
8899 : u64 cur_bytes;
8900 : int ret = 0;
8901 : bool own_trans = true;
8902 :
8903 7582 : if (trans)
8904 : own_trans = false;
8905 15164 : while (num_bytes > 0) {
8906 7582 : if (own_trans) {
8907 3607 : trans = btrfs_start_transaction(root, 3);
8908 3607 : if (IS_ERR(trans)) {
8909 0 : ret = PTR_ERR(trans);
8910 0 : break;
8911 : }
8912 : }
8913 :
8914 7582 : cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
8915 7582 : cur_bytes = max(cur_bytes, min_size);
8916 7582 : ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
8917 : *alloc_hint, &ins, 1, 0);
8918 7582 : if (ret) {
8919 0 : if (own_trans)
8920 0 : btrfs_end_transaction(trans, root);
8921 : break;
8922 : }
8923 :
8924 7582 : ret = insert_reserved_file_extent(trans, inode,
8925 : cur_offset, ins.objectid,
8926 : ins.offset, ins.offset,
8927 : ins.offset, 0, 0, 0,
8928 : BTRFS_FILE_EXTENT_PREALLOC);
8929 7582 : if (ret) {
8930 0 : btrfs_free_reserved_extent(root, ins.objectid,
8931 : ins.offset, 0);
8932 0 : btrfs_abort_transaction(trans, root, ret);
8933 0 : if (own_trans)
8934 0 : btrfs_end_transaction(trans, root);
8935 : break;
8936 : }
8937 7582 : btrfs_drop_extent_cache(inode, cur_offset,
8938 7582 : 					cur_offset + ins.offset - 1, 0);
8939 :
8940 7582 : em = alloc_extent_map();
8941 7582 : if (!em) {
8942 : set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
8943 : &BTRFS_I(inode)->runtime_flags);
8944 : goto next;
8945 : }
8946 :
8947 7582 : em->start = cur_offset;
8948 7582 : em->orig_start = cur_offset;
8949 7582 : em->len = ins.offset;
8950 7582 : em->block_start = ins.objectid;
8951 7582 : em->block_len = ins.offset;
8952 7582 : em->orig_block_len = ins.offset;
8953 7582 : em->ram_bytes = ins.offset;
8954 7582 : em->bdev = root->fs_info->fs_devices->latest_bdev;
8955 : set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
8956 7582 : em->generation = trans->transid;
8957 :
8958 : while (1) {
8959 7582 : write_lock(&em_tree->lock);
8960 7582 : ret = add_extent_mapping(em_tree, em, 1);
8961 : write_unlock(&em_tree->lock);
8962 7582 : if (ret != -EEXIST)
8963 : break;
8964 0 : btrfs_drop_extent_cache(inode, cur_offset,
8965 0 : cur_offset + ins.offset - 1,
8966 : 0);
8967 0 : }
8968 7582 : free_extent_map(em);
8969 : next:
8970 7582 : num_bytes -= ins.offset;
8971 7582 : cur_offset += ins.offset;
8972 7582 : *alloc_hint = ins.objectid + ins.offset;
8973 :
8974 : inode_inc_iversion(inode);
8975 7582 : inode->i_ctime = CURRENT_TIME;
8976 7582 : BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
8977 13012 : if (!(mode & FALLOC_FL_KEEP_SIZE) &&
8978 10795 : (actual_len > inode->i_size) &&
8979 5365 : (cur_offset > inode->i_size)) {
8980 5309 : if (cur_offset > actual_len)
8981 : i_size = actual_len;
8982 : else
8983 : i_size = cur_offset;
8984 5309 : i_size_write(inode, i_size);
8985 5309 : btrfs_ordered_update_i_size(inode, i_size, NULL);
8986 : }
8987 :
8988 7582 : ret = btrfs_update_inode(trans, root, inode);
8989 :
8990 7582 : if (ret) {
8991 0 : btrfs_abort_transaction(trans, root, ret);
8992 0 : if (own_trans)
8993 0 : btrfs_end_transaction(trans, root);
8994 : break;
8995 : }
8996 :
8997 7582 : if (own_trans)
8998 3607 : btrfs_end_transaction(trans, root);
8999 : }
9000 7582 : return ret;
9001 : }
9002 :
9003 3607 : int btrfs_prealloc_file_range(struct inode *inode, int mode,
9004 : u64 start, u64 num_bytes, u64 min_size,
9005 : loff_t actual_len, u64 *alloc_hint)
9006 : {
9007 3607 : return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9008 : min_size, actual_len, alloc_hint,
9009 : NULL);
9010 : }
9011 :
9012 3975 : int btrfs_prealloc_file_range_trans(struct inode *inode,
9013 : struct btrfs_trans_handle *trans, int mode,
9014 : u64 start, u64 num_bytes, u64 min_size,
9015 : loff_t actual_len, u64 *alloc_hint)
9016 : {
9017 3975 : return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9018 : min_size, actual_len, alloc_hint, trans);
9019 : }
9020 :
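/*
 * Illustrative userspace sketch (not part of the kernel source):
 * fallocate(2) is what ultimately drives the preallocation loop in
 * __btrfs_prealloc_file_range() above. With FALLOC_FL_KEEP_SIZE the
 * i_size update branch there is skipped, so space is reserved as
 * BTRFS_FILE_EXTENT_PREALLOC extents while st_size stays 0. The path
 * is hypothetical.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("/mnt/btrfs/prealloc", O_CREAT | O_WRONLY, 0644);

	if (fd < 0)
		return 1;
	/* Reserve 1MiB without changing the visible file size. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) == -1)
		perror("fallocate");
	close(fd);
	return 0;
}
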
9021 1363190 : static int btrfs_set_page_dirty(struct page *page)
9022 : {
9023 1363190 : return __set_page_dirty_nobuffers(page);
9024 : }
9025 :
9026 6492852 : static int btrfs_permission(struct inode *inode, int mask)
9027 : {
9028 6578392 : struct btrfs_root *root = BTRFS_I(inode)->root;
9029 6492852 : umode_t mode = inode->i_mode;
9030 :
9031 6578696 : if (mask & MAY_WRITE &&
9032 86147 : (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9033 85540 : if (btrfs_root_readonly(root))
9034 : return -EROFS;
9035 85539 : if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9036 : return -EACCES;
9037 : }
9038 6492848 : return generic_permission(inode, mask);
9039 : }
9040 :
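/*
 * Illustrative userspace sketch (not part of the kernel source):
 * btrfs_permission() above is why opening a file for writing inside a
 * read-only subvolume (e.g. a snapshot created with -r) fails with
 * EROFS even when the mount itself is read-write. The path is
 * hypothetical.
 */
#include <stdio.h>
#include <fcntl.h>

int main(void)
{
	if (open("/mnt/btrfs/ro-snap/file", O_WRONLY) == -1)
		perror("open");	/* EROFS inside a read-only subvolume */
	return 0;
}
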
9041 2 : static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
9042 : {
9043 : struct btrfs_trans_handle *trans;
9044 2 : struct btrfs_root *root = BTRFS_I(dir)->root;
9045 : struct inode *inode = NULL;
9046 : u64 objectid;
9047 : u64 index;
9048 : int ret = 0;
9049 :
9050 : /*
9051 : 	 * 5 units are required for adding an orphan entry.
9052 : */
9053 2 : trans = btrfs_start_transaction(root, 5);
9054 2 : if (IS_ERR(trans))
9055 0 : return PTR_ERR(trans);
9056 :
9057 2 : ret = btrfs_find_free_ino(root, &objectid);
9058 2 : if (ret)
9059 : goto out;
9060 :
9061 4 : inode = btrfs_new_inode(trans, root, dir, NULL, 0,
9062 : btrfs_ino(dir), objectid, mode, &index);
9063 2 : if (IS_ERR(inode)) {
9064 0 : ret = PTR_ERR(inode);
9065 : inode = NULL;
9066 0 : goto out;
9067 : }
9068 :
9069 2 : inode->i_fop = &btrfs_file_operations;
9070 2 : inode->i_op = &btrfs_file_inode_operations;
9071 :
9072 2 : inode->i_mapping->a_ops = &btrfs_aops;
9073 2 : inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9074 2 : BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9075 :
9076 2 : ret = btrfs_init_inode_security(trans, inode, dir, NULL);
9077 2 : if (ret)
9078 : goto out_inode;
9079 :
9080 2 : ret = btrfs_update_inode(trans, root, inode);
9081 2 : if (ret)
9082 : goto out_inode;
9083 2 : ret = btrfs_orphan_add(trans, inode);
9084 2 : if (ret)
9085 : goto out_inode;
9086 :
9087 : /*
9088 : 	 * We set the number of links to 0 in btrfs_new_inode(), and here we
9089 : 	 * set it to 1 because d_tmpfile() decrements the count and a warning
9090 : 	 * would be issued if it were already 0, through:
9091 : *
9092 : * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9093 : */
9094 2 : set_nlink(inode, 1);
9095 2 : unlock_new_inode(inode);
9096 2 : d_tmpfile(dentry, inode);
9097 : mark_inode_dirty(inode);
9098 :
9099 : out:
9100 2 : btrfs_end_transaction(trans, root);
9101 2 : if (ret)
9102 0 : iput(inode);
9103 2 : btrfs_balance_delayed_items(root);
9104 2 : btrfs_btree_balance_dirty(root);
9105 2 : return ret;
9106 :
9107 : out_inode:
9108 0 : unlock_new_inode(inode);
9109 0 : goto out;
9110 :
9111 : }
9112 :
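/*
 * Illustrative userspace sketch (not part of the kernel source): the
 * O_TMPFILE path that reaches btrfs_tmpfile() above. The inode starts
 * out unlinked (with an orphan item so a crash reclaims it), and
 * linkat() via /proc/self/fd can give it a name afterwards. Paths are
 * hypothetical.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char proc_path[64];
	int fd = open("/mnt/btrfs", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0)
		return 1;
	/* The file is fully usable but has no name yet. */
	write(fd, "hello\n", 6);

	/* Optionally materialize it in the namespace. */
	snprintf(proc_path, sizeof(proc_path), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, proc_path, AT_FDCWD, "/mnt/btrfs/visible",
		   AT_SYMLINK_FOLLOW) == -1)
		perror("linkat");
	close(fd);
	return 0;
}
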
9113 : static const struct inode_operations btrfs_dir_inode_operations = {
9114 : .getattr = btrfs_getattr,
9115 : .lookup = btrfs_lookup,
9116 : .create = btrfs_create,
9117 : .unlink = btrfs_unlink,
9118 : .link = btrfs_link,
9119 : .mkdir = btrfs_mkdir,
9120 : .rmdir = btrfs_rmdir,
9121 : .rename2 = btrfs_rename2,
9122 : .symlink = btrfs_symlink,
9123 : .setattr = btrfs_setattr,
9124 : .mknod = btrfs_mknod,
9125 : .setxattr = btrfs_setxattr,
9126 : .getxattr = btrfs_getxattr,
9127 : .listxattr = btrfs_listxattr,
9128 : .removexattr = btrfs_removexattr,
9129 : .permission = btrfs_permission,
9130 : .get_acl = btrfs_get_acl,
9131 : .set_acl = btrfs_set_acl,
9132 : .update_time = btrfs_update_time,
9133 : .tmpfile = btrfs_tmpfile,
9134 : };
9135 : static const struct inode_operations btrfs_dir_ro_inode_operations = {
9136 : .lookup = btrfs_lookup,
9137 : .permission = btrfs_permission,
9138 : .get_acl = btrfs_get_acl,
9139 : .set_acl = btrfs_set_acl,
9140 : .update_time = btrfs_update_time,
9141 : };
9142 :
9143 : static const struct file_operations btrfs_dir_file_operations = {
9144 : .llseek = generic_file_llseek,
9145 : .read = generic_read_dir,
9146 : .iterate = btrfs_real_readdir,
9147 : .unlocked_ioctl = btrfs_ioctl,
9148 : #ifdef CONFIG_COMPAT
9149 : .compat_ioctl = btrfs_ioctl,
9150 : #endif
9151 : .release = btrfs_release_file,
9152 : .fsync = btrfs_sync_file,
9153 : };
9154 :
9155 : static struct extent_io_ops btrfs_extent_io_ops = {
9156 : .fill_delalloc = run_delalloc_range,
9157 : .submit_bio_hook = btrfs_submit_bio_hook,
9158 : .merge_bio_hook = btrfs_merge_bio_hook,
9159 : .readpage_end_io_hook = btrfs_readpage_end_io_hook,
9160 : .writepage_end_io_hook = btrfs_writepage_end_io_hook,
9161 : .writepage_start_hook = btrfs_writepage_start_hook,
9162 : .set_bit_hook = btrfs_set_bit_hook,
9163 : .clear_bit_hook = btrfs_clear_bit_hook,
9164 : .merge_extent_hook = btrfs_merge_extent_hook,
9165 : .split_extent_hook = btrfs_split_extent_hook,
9166 : };
9167 :
9168 : /*
9169 : * btrfs doesn't support the bmap operation because swapfiles use
9170 : * bmap to build a mapping of the extents in a file. They assume
9171 : * these extents won't change over the life of the file and use the
9172 : * bmap result to do IO directly to the drive.
9173 : *
9174 : * A btrfs bmap call would return logical addresses that aren't
9175 : * suitable for IO, and those addresses also change frequently as
9176 : * COW operations happen. So, swapfile + btrfs == corruption.
9177 : *
9178 : * For now we avoid this by not implementing bmap.
9179 : */
9180 : static const struct address_space_operations btrfs_aops = {
9181 : .readpage = btrfs_readpage,
9182 : .writepage = btrfs_writepage,
9183 : .writepages = btrfs_writepages,
9184 : .readpages = btrfs_readpages,
9185 : .direct_IO = btrfs_direct_IO,
9186 : .invalidatepage = btrfs_invalidatepage,
9187 : .releasepage = btrfs_releasepage,
9188 : .set_page_dirty = btrfs_set_page_dirty,
9189 : .error_remove_page = generic_error_remove_page,
9190 : };
9191 :
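/*
 * Illustrative userspace sketch (not part of the kernel source): the
 * missing .bmap entry in btrfs_aops above is observable from
 * userspace, where the FIBMAP ioctl fails with EINVAL instead of
 * returning a physical block number. FIBMAP requires CAP_SYS_RAWIO;
 * the file path is hypothetical.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	int block = 0;	/* logical block in, physical block out */
	int fd = open("/mnt/btrfs/somefile", O_RDONLY);

	if (fd < 0)
		return 1;
	if (ioctl(fd, FIBMAP, &block) == -1)
		perror("FIBMAP");	/* EINVAL on btrfs */
	else
		printf("block 0 -> %d\n", block);
	close(fd);
	return 0;
}
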
9192 : static const struct address_space_operations btrfs_symlink_aops = {
9193 : .readpage = btrfs_readpage,
9194 : .writepage = btrfs_writepage,
9195 : .invalidatepage = btrfs_invalidatepage,
9196 : .releasepage = btrfs_releasepage,
9197 : };
9198 :
9199 : static const struct inode_operations btrfs_file_inode_operations = {
9200 : .getattr = btrfs_getattr,
9201 : .setattr = btrfs_setattr,
9202 : .setxattr = btrfs_setxattr,
9203 : .getxattr = btrfs_getxattr,
9204 : .listxattr = btrfs_listxattr,
9205 : .removexattr = btrfs_removexattr,
9206 : .permission = btrfs_permission,
9207 : .fiemap = btrfs_fiemap,
9208 : .get_acl = btrfs_get_acl,
9209 : .set_acl = btrfs_set_acl,
9210 : .update_time = btrfs_update_time,
9211 : };
9212 : static const struct inode_operations btrfs_special_inode_operations = {
9213 : .getattr = btrfs_getattr,
9214 : .setattr = btrfs_setattr,
9215 : .permission = btrfs_permission,
9216 : .setxattr = btrfs_setxattr,
9217 : .getxattr = btrfs_getxattr,
9218 : .listxattr = btrfs_listxattr,
9219 : .removexattr = btrfs_removexattr,
9220 : .get_acl = btrfs_get_acl,
9221 : .set_acl = btrfs_set_acl,
9222 : .update_time = btrfs_update_time,
9223 : };
9224 : static const struct inode_operations btrfs_symlink_inode_operations = {
9225 : .readlink = generic_readlink,
9226 : .follow_link = page_follow_link_light,
9227 : .put_link = page_put_link,
9228 : .getattr = btrfs_getattr,
9229 : .setattr = btrfs_setattr,
9230 : .permission = btrfs_permission,
9231 : .setxattr = btrfs_setxattr,
9232 : .getxattr = btrfs_getxattr,
9233 : .listxattr = btrfs_listxattr,
9234 : .removexattr = btrfs_removexattr,
9235 : .update_time = btrfs_update_time,
9236 : };
9237 :
9238 : const struct dentry_operations btrfs_dentry_operations = {
9239 : .d_delete = btrfs_dentry_delete,
9240 : .d_release = btrfs_dentry_release,
9241 : };
|