1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2012 Fusion-io All rights reserved.
4 * Copyright (C) 2012 Intel Corp. All rights reserved.
5 */
6
7 #include <linux/sched.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
15 #include <linux/mm.h>
16 #include "misc.h"
17 #include "ctree.h"
18 #include "disk-io.h"
19 #include "volumes.h"
20 #include "raid56.h"
21 #include "async-thread.h"
22
23 /* set when additional merges to this rbio are not allowed */
24 #define RBIO_RMW_LOCKED_BIT 1
25
26 /*
27 * set when this rbio is sitting in the hash, but it is just a cache
28 * of past RMW
29 */
30 #define RBIO_CACHE_BIT 2
31
32 /*
33 * set when it is safe to trust the stripe_pages for caching
34 */
35 #define RBIO_CACHE_READY_BIT 3
36
37 #define RBIO_CACHE_SIZE 1024
38
39 #define BTRFS_STRIPE_HASH_TABLE_BITS 11
40
/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	/* list of rbios currently holding / waiting for this stripe lock */
	struct list_head hash_list;
	/* protects hash_list */
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	/* LRU of cached rbios, most recently used at the head */
	struct list_head stripe_cache;
	/* protects stripe_cache and cache_size */
	spinlock_t cache_lock;
	/* current number of rbios on stripe_cache */
	int cache_size;
	/* hash buckets; 1 << BTRFS_STRIPE_HASH_TABLE_BITS entries */
	struct btrfs_stripe_hash table[];
};
54
/* What kind of IO this rbio is performing (see rbio->operation) */
enum btrfs_rbio_ops {
	/* full or partial stripe write (read/modify/write path) */
	BTRFS_RBIO_WRITE,
	/* rebuild data for a read from higher up the stack */
	BTRFS_RBIO_READ_REBUILD,
	/* check and repair parity (scrub) */
	BTRFS_RBIO_PARITY_SCRUB,
	/* rebuild data of a missing device */
	BTRFS_RBIO_REBUILD_MISSING,
};
61
struct btrfs_raid_bio {
	/* io context describing the full stripe layout (also holds fs_info) */
	struct btrfs_io_context *bioc;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * LRU list for the stripe cache
	 */
	struct list_head stripe_cache;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/* also protected by the bio_list_lock, the
	 * plug list is used by the plugging code
	 * to collect partial bios while plugged. The
	 * stripe locking code also uses it to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio (RBIO_*_BIT)
	 */
	unsigned long flags;

	/* size of each individual stripe on disk, in bytes */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	/* stripes on disk minus the target-device copies (see alloc_rbio()) */
	int real_stripes;

	/* pages per single stripe: DIV_ROUND_UP(stripe_len, PAGE_SIZE) */
	int stripe_npages;
	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	enum btrfs_rbio_ops operation;

	/* first bad stripe, -1 when none (set in alloc_rbio()) */
	int faila;

	/* second bad stripe (for raid6 use), -1 when none */
	int failb;

	/* stripe index being scrubbed (parity scrub only) */
	int scrubp;
	/*
	 * number of pages needed to represent the full
	 * stripe across all drives
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list. This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	/* bios accounted in the fs-wide bio counter; subtracted on end_io */
	int generic_bio_cnt;

	/* reference count; freeing happens in __free_raid_bio() */
	refcount_t refs;

	/* per-device bios still in flight for this rbio */
	atomic_t stripes_pending;

	/* number of stripe IO failures, compared against bioc->max_errors */
	atomic_t error;
	/*
	 * these are two arrays of pointers. We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list. Stored
	 * here for faster lookup
	 */
	struct page **bio_pages;

	/*
	 * bitmap to record which horizontal stripe has data
	 */
	unsigned long *dbitmap;

	/* allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/* allocated with stripe_npages-many bits for finish_*() calls */
	unsigned long *finish_pbitmap;
};
176
177 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
178 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
179 static void rmw_work(struct btrfs_work *work);
180 static void read_rebuild_work(struct btrfs_work *work);
181 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
182 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
183 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
184 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
185 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
186
187 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
188 int need_check);
189 static void scrub_parity_work(struct btrfs_work *work);
190
/* Queue @work_func for this rbio on the fs-wide rmw worker pool. */
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
	btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
196
197 /*
198 * the stripe hash table is used for locking, and to collect
199 * bios in hopes of making a full stripe
200 */
btrfs_alloc_stripe_hash_table(struct btrfs_fs_info * info)201 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
202 {
203 struct btrfs_stripe_hash_table *table;
204 struct btrfs_stripe_hash_table *x;
205 struct btrfs_stripe_hash *cur;
206 struct btrfs_stripe_hash *h;
207 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
208 int i;
209
210 if (info->stripe_hash_table)
211 return 0;
212
213 /*
214 * The table is large, starting with order 4 and can go as high as
215 * order 7 in case lock debugging is turned on.
216 *
217 * Try harder to allocate and fallback to vmalloc to lower the chance
218 * of a failing mount.
219 */
220 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
221 if (!table)
222 return -ENOMEM;
223
224 spin_lock_init(&table->cache_lock);
225 INIT_LIST_HEAD(&table->stripe_cache);
226
227 h = table->table;
228
229 for (i = 0; i < num_entries; i++) {
230 cur = h + i;
231 INIT_LIST_HEAD(&cur->hash_list);
232 spin_lock_init(&cur->lock);
233 }
234
235 x = cmpxchg(&info->stripe_hash_table, NULL, table);
236 kvfree(x);
237 return 0;
238 }
239
240 /*
241 * caching an rbio means to copy anything from the
242 * bio_pages array into the stripe_pages array. We
243 * use the page uptodate bit in the stripe cache array
244 * to indicate if it has valid data
245 *
246 * once the caching is done, we set the cache ready
247 * bit.
248 */
cache_rbio_pages(struct btrfs_raid_bio * rbio)249 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
250 {
251 int i;
252 int ret;
253
254 ret = alloc_rbio_pages(rbio);
255 if (ret)
256 return;
257
258 for (i = 0; i < rbio->nr_pages; i++) {
259 if (!rbio->bio_pages[i])
260 continue;
261
262 copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
263 SetPageUptodate(rbio->stripe_pages[i]);
264 }
265 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
266 }
267
268 /*
269 * we hash on the first logical address of the stripe
270 */
rbio_bucket(struct btrfs_raid_bio * rbio)271 static int rbio_bucket(struct btrfs_raid_bio *rbio)
272 {
273 u64 num = rbio->bioc->raid_map[0];
274
275 /*
276 * we shift down quite a bit. We're using byte
277 * addressing, and most of the lower bits are zeros.
278 * This tends to upset hash_64, and it consistently
279 * returns just one or two different values.
280 *
281 * shifting off the lower bits fixes things.
282 */
283 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
284 }
285
286 /*
287 * stealing an rbio means taking all the uptodate pages from the stripe
288 * array in the source rbio and putting them into the destination rbio
289 */
steal_rbio(struct btrfs_raid_bio * src,struct btrfs_raid_bio * dest)290 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
291 {
292 int i;
293 struct page *s;
294 struct page *d;
295
296 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
297 return;
298
299 for (i = 0; i < dest->nr_pages; i++) {
300 s = src->stripe_pages[i];
301 if (!s || !PageUptodate(s)) {
302 continue;
303 }
304
305 d = dest->stripe_pages[i];
306 if (d)
307 __free_page(d);
308
309 dest->stripe_pages[i] = s;
310 src->stripe_pages[i] = NULL;
311 }
312 }
313
314 /*
315 * merging means we take the bio_list from the victim and
316 * splice it into the destination. The victim should
317 * be discarded afterwards.
318 *
319 * must be called with dest->rbio_list_lock held
320 */
merge_rbio(struct btrfs_raid_bio * dest,struct btrfs_raid_bio * victim)321 static void merge_rbio(struct btrfs_raid_bio *dest,
322 struct btrfs_raid_bio *victim)
323 {
324 bio_list_merge(&dest->bio_list, &victim->bio_list);
325 dest->bio_list_bytes += victim->bio_list_bytes;
326 dest->generic_bio_cnt += victim->generic_bio_cnt;
327 bio_list_init(&victim->bio_list);
328 }
329
330 /*
331 * used to prune items that are in the cache. The caller
332 * must hold the hash table lock.
333 */
__remove_rbio_from_cache(struct btrfs_raid_bio * rbio)334 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
335 {
336 int bucket = rbio_bucket(rbio);
337 struct btrfs_stripe_hash_table *table;
338 struct btrfs_stripe_hash *h;
339 int freeit = 0;
340
341 /*
342 * check the bit again under the hash table lock.
343 */
344 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
345 return;
346
347 table = rbio->bioc->fs_info->stripe_hash_table;
348 h = table->table + bucket;
349
350 /* hold the lock for the bucket because we may be
351 * removing it from the hash table
352 */
353 spin_lock(&h->lock);
354
355 /*
356 * hold the lock for the bio list because we need
357 * to make sure the bio list is empty
358 */
359 spin_lock(&rbio->bio_list_lock);
360
361 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
362 list_del_init(&rbio->stripe_cache);
363 table->cache_size -= 1;
364 freeit = 1;
365
366 /* if the bio list isn't empty, this rbio is
367 * still involved in an IO. We take it out
368 * of the cache list, and drop the ref that
369 * was held for the list.
370 *
371 * If the bio_list was empty, we also remove
372 * the rbio from the hash_table, and drop
373 * the corresponding ref
374 */
375 if (bio_list_empty(&rbio->bio_list)) {
376 if (!list_empty(&rbio->hash_list)) {
377 list_del_init(&rbio->hash_list);
378 refcount_dec(&rbio->refs);
379 BUG_ON(!list_empty(&rbio->plug_list));
380 }
381 }
382 }
383
384 spin_unlock(&rbio->bio_list_lock);
385 spin_unlock(&h->lock);
386
387 if (freeit)
388 __free_raid_bio(rbio);
389 }
390
391 /*
392 * prune a given rbio from the cache
393 */
remove_rbio_from_cache(struct btrfs_raid_bio * rbio)394 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
395 {
396 struct btrfs_stripe_hash_table *table;
397 unsigned long flags;
398
399 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
400 return;
401
402 table = rbio->bioc->fs_info->stripe_hash_table;
403
404 spin_lock_irqsave(&table->cache_lock, flags);
405 __remove_rbio_from_cache(rbio);
406 spin_unlock_irqrestore(&table->cache_lock, flags);
407 }
408
409 /*
410 * remove everything in the cache
411 */
btrfs_clear_rbio_cache(struct btrfs_fs_info * info)412 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
413 {
414 struct btrfs_stripe_hash_table *table;
415 unsigned long flags;
416 struct btrfs_raid_bio *rbio;
417
418 table = info->stripe_hash_table;
419
420 spin_lock_irqsave(&table->cache_lock, flags);
421 while (!list_empty(&table->stripe_cache)) {
422 rbio = list_entry(table->stripe_cache.next,
423 struct btrfs_raid_bio,
424 stripe_cache);
425 __remove_rbio_from_cache(rbio);
426 }
427 spin_unlock_irqrestore(&table->cache_lock, flags);
428 }
429
430 /*
431 * remove all cached entries and free the hash table
432 * used by unmount
433 */
btrfs_free_stripe_hash_table(struct btrfs_fs_info * info)434 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
435 {
436 if (!info->stripe_hash_table)
437 return;
438 btrfs_clear_rbio_cache(info);
439 kvfree(info->stripe_hash_table);
440 info->stripe_hash_table = NULL;
441 }
442
443 /*
444 * insert an rbio into the stripe cache. It
445 * must have already been prepared by calling
446 * cache_rbio_pages
447 *
448 * If this rbio was already cached, it gets
449 * moved to the front of the lru.
450 *
451 * If the size of the rbio cache is too big, we
452 * prune an item.
453 */
cache_rbio(struct btrfs_raid_bio * rbio)454 static void cache_rbio(struct btrfs_raid_bio *rbio)
455 {
456 struct btrfs_stripe_hash_table *table;
457 unsigned long flags;
458
459 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
460 return;
461
462 table = rbio->bioc->fs_info->stripe_hash_table;
463
464 spin_lock_irqsave(&table->cache_lock, flags);
465 spin_lock(&rbio->bio_list_lock);
466
467 /* bump our ref if we were not in the list before */
468 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
469 refcount_inc(&rbio->refs);
470
471 if (!list_empty(&rbio->stripe_cache)){
472 list_move(&rbio->stripe_cache, &table->stripe_cache);
473 } else {
474 list_add(&rbio->stripe_cache, &table->stripe_cache);
475 table->cache_size += 1;
476 }
477
478 spin_unlock(&rbio->bio_list_lock);
479
480 if (table->cache_size > RBIO_CACHE_SIZE) {
481 struct btrfs_raid_bio *found;
482
483 found = list_entry(table->stripe_cache.prev,
484 struct btrfs_raid_bio,
485 stripe_cache);
486
487 if (found != rbio)
488 __remove_rbio_from_cache(found);
489 }
490
491 spin_unlock_irqrestore(&table->cache_lock, flags);
492 }
493
494 /*
495 * helper function to run the xor_blocks api. It is only
496 * able to do MAX_XOR_BLOCKS at a time, so we need to
497 * loop through.
498 */
run_xor(void ** pages,int src_cnt,ssize_t len)499 static void run_xor(void **pages, int src_cnt, ssize_t len)
500 {
501 int src_off = 0;
502 int xor_src_cnt = 0;
503 void *dest = pages[src_cnt];
504
505 while(src_cnt > 0) {
506 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
507 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
508
509 src_cnt -= xor_src_cnt;
510 src_off += xor_src_cnt;
511 }
512 }
513
514 /*
515 * Returns true if the bio list inside this rbio covers an entire stripe (no
516 * rmw required).
517 */
rbio_is_full(struct btrfs_raid_bio * rbio)518 static int rbio_is_full(struct btrfs_raid_bio *rbio)
519 {
520 unsigned long flags;
521 unsigned long size = rbio->bio_list_bytes;
522 int ret = 1;
523
524 spin_lock_irqsave(&rbio->bio_list_lock, flags);
525 if (size != rbio->nr_data * rbio->stripe_len)
526 ret = 0;
527 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
528 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
529
530 return ret;
531 }
532
533 /*
534 * returns 1 if it is safe to merge two rbios together.
535 * The merging is safe if the two rbios correspond to
536 * the same stripe and if they are both going in the same
537 * direction (read vs write), and if neither one is
538 * locked for final IO
539 *
540 * The caller is responsible for locking such that
541 * rmw_locked is safe to test
542 */
rbio_can_merge(struct btrfs_raid_bio * last,struct btrfs_raid_bio * cur)543 static int rbio_can_merge(struct btrfs_raid_bio *last,
544 struct btrfs_raid_bio *cur)
545 {
546 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
547 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
548 return 0;
549
550 /*
551 * we can't merge with cached rbios, since the
552 * idea is that when we merge the destination
553 * rbio is going to run our IO for us. We can
554 * steal from cached rbios though, other functions
555 * handle that.
556 */
557 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
558 test_bit(RBIO_CACHE_BIT, &cur->flags))
559 return 0;
560
561 if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
562 return 0;
563
564 /* we can't merge with different operations */
565 if (last->operation != cur->operation)
566 return 0;
567 /*
568 * We've need read the full stripe from the drive.
569 * check and repair the parity and write the new results.
570 *
571 * We're not allowed to add any new bios to the
572 * bio list here, anyone else that wants to
573 * change this stripe needs to do their own rmw.
574 */
575 if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
576 return 0;
577
578 if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
579 return 0;
580
581 if (last->operation == BTRFS_RBIO_READ_REBUILD) {
582 int fa = last->faila;
583 int fb = last->failb;
584 int cur_fa = cur->faila;
585 int cur_fb = cur->failb;
586
587 if (last->faila >= last->failb) {
588 fa = last->failb;
589 fb = last->faila;
590 }
591
592 if (cur->faila >= cur->failb) {
593 cur_fa = cur->failb;
594 cur_fb = cur->faila;
595 }
596
597 if (fa != cur_fa || fb != cur_fb)
598 return 0;
599 }
600 return 1;
601 }
602
rbio_stripe_page_index(struct btrfs_raid_bio * rbio,int stripe,int index)603 static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
604 int index)
605 {
606 return stripe * rbio->stripe_npages + index;
607 }
608
609 /*
610 * these are just the pages from the rbio array, not from anything
611 * the FS sent down to us
612 */
rbio_stripe_page(struct btrfs_raid_bio * rbio,int stripe,int index)613 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
614 int index)
615 {
616 return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
617 }
618
619 /*
620 * helper to index into the pstripe
621 */
rbio_pstripe_page(struct btrfs_raid_bio * rbio,int index)622 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
623 {
624 return rbio_stripe_page(rbio, rbio->nr_data, index);
625 }
626
627 /*
628 * helper to index into the qstripe, returns null
629 * if there is no qstripe
630 */
rbio_qstripe_page(struct btrfs_raid_bio * rbio,int index)631 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
632 {
633 if (rbio->nr_data + 1 == rbio->real_stripes)
634 return NULL;
635 return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
636 }
637
638 /*
639 * The first stripe in the table for a logical address
640 * has the lock. rbios are added in one of three ways:
641 *
642 * 1) Nobody has the stripe locked yet. The rbio is given
643 * the lock and 0 is returned. The caller must start the IO
644 * themselves.
645 *
646 * 2) Someone has the stripe locked, but we're able to merge
647 * with the lock owner. The rbio is freed and the IO will
648 * start automatically along with the existing rbio. 1 is returned.
649 *
650 * 3) Someone has the stripe locked, but we're not able to merge.
651 * The rbio is added to the lock owner's plug list, or merged into
652 * an rbio already on the plug list. When the lock owner unlocks,
653 * the next rbio on the list is run and the IO is started automatically.
654 * 1 is returned
655 *
656 * If we return 0, the caller still owns the rbio and must continue with
657 * IO submission. If we return 1, the caller must assume the rbio has
658 * already been freed.
659 */
lock_stripe_add(struct btrfs_raid_bio * rbio)660 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
661 {
662 struct btrfs_stripe_hash *h;
663 struct btrfs_raid_bio *cur;
664 struct btrfs_raid_bio *pending;
665 unsigned long flags;
666 struct btrfs_raid_bio *freeit = NULL;
667 struct btrfs_raid_bio *cache_drop = NULL;
668 int ret = 0;
669
670 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
671
672 spin_lock_irqsave(&h->lock, flags);
673 list_for_each_entry(cur, &h->hash_list, hash_list) {
674 if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
675 continue;
676
677 spin_lock(&cur->bio_list_lock);
678
679 /* Can we steal this cached rbio's pages? */
680 if (bio_list_empty(&cur->bio_list) &&
681 list_empty(&cur->plug_list) &&
682 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
683 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
684 list_del_init(&cur->hash_list);
685 refcount_dec(&cur->refs);
686
687 steal_rbio(cur, rbio);
688 cache_drop = cur;
689 spin_unlock(&cur->bio_list_lock);
690
691 goto lockit;
692 }
693
694 /* Can we merge into the lock owner? */
695 if (rbio_can_merge(cur, rbio)) {
696 merge_rbio(cur, rbio);
697 spin_unlock(&cur->bio_list_lock);
698 freeit = rbio;
699 ret = 1;
700 goto out;
701 }
702
703
704 /*
705 * We couldn't merge with the running rbio, see if we can merge
706 * with the pending ones. We don't have to check for rmw_locked
707 * because there is no way they are inside finish_rmw right now
708 */
709 list_for_each_entry(pending, &cur->plug_list, plug_list) {
710 if (rbio_can_merge(pending, rbio)) {
711 merge_rbio(pending, rbio);
712 spin_unlock(&cur->bio_list_lock);
713 freeit = rbio;
714 ret = 1;
715 goto out;
716 }
717 }
718
719 /*
720 * No merging, put us on the tail of the plug list, our rbio
721 * will be started with the currently running rbio unlocks
722 */
723 list_add_tail(&rbio->plug_list, &cur->plug_list);
724 spin_unlock(&cur->bio_list_lock);
725 ret = 1;
726 goto out;
727 }
728 lockit:
729 refcount_inc(&rbio->refs);
730 list_add(&rbio->hash_list, &h->hash_list);
731 out:
732 spin_unlock_irqrestore(&h->lock, flags);
733 if (cache_drop)
734 remove_rbio_from_cache(cache_drop);
735 if (freeit)
736 __free_raid_bio(freeit);
737 return ret;
738 }
739
740 /*
741 * called as rmw or parity rebuild is completed. If the plug list has more
742 * rbios waiting for this stripe, the next one on the list will be started
743 */
unlock_stripe(struct btrfs_raid_bio * rbio)744 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
745 {
746 int bucket;
747 struct btrfs_stripe_hash *h;
748 unsigned long flags;
749 int keep_cache = 0;
750
751 bucket = rbio_bucket(rbio);
752 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
753
754 if (list_empty(&rbio->plug_list))
755 cache_rbio(rbio);
756
757 spin_lock_irqsave(&h->lock, flags);
758 spin_lock(&rbio->bio_list_lock);
759
760 if (!list_empty(&rbio->hash_list)) {
761 /*
762 * if we're still cached and there is no other IO
763 * to perform, just leave this rbio here for others
764 * to steal from later
765 */
766 if (list_empty(&rbio->plug_list) &&
767 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
768 keep_cache = 1;
769 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
770 BUG_ON(!bio_list_empty(&rbio->bio_list));
771 goto done;
772 }
773
774 list_del_init(&rbio->hash_list);
775 refcount_dec(&rbio->refs);
776
777 /*
778 * we use the plug list to hold all the rbios
779 * waiting for the chance to lock this stripe.
780 * hand the lock over to one of them.
781 */
782 if (!list_empty(&rbio->plug_list)) {
783 struct btrfs_raid_bio *next;
784 struct list_head *head = rbio->plug_list.next;
785
786 next = list_entry(head, struct btrfs_raid_bio,
787 plug_list);
788
789 list_del_init(&rbio->plug_list);
790
791 list_add(&next->hash_list, &h->hash_list);
792 refcount_inc(&next->refs);
793 spin_unlock(&rbio->bio_list_lock);
794 spin_unlock_irqrestore(&h->lock, flags);
795
796 if (next->operation == BTRFS_RBIO_READ_REBUILD)
797 start_async_work(next, read_rebuild_work);
798 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
799 steal_rbio(rbio, next);
800 start_async_work(next, read_rebuild_work);
801 } else if (next->operation == BTRFS_RBIO_WRITE) {
802 steal_rbio(rbio, next);
803 start_async_work(next, rmw_work);
804 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
805 steal_rbio(rbio, next);
806 start_async_work(next, scrub_parity_work);
807 }
808
809 goto done_nolock;
810 }
811 }
812 done:
813 spin_unlock(&rbio->bio_list_lock);
814 spin_unlock_irqrestore(&h->lock, flags);
815
816 done_nolock:
817 if (!keep_cache)
818 remove_rbio_from_cache(rbio);
819 }
820
__free_raid_bio(struct btrfs_raid_bio * rbio)821 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
822 {
823 int i;
824
825 if (!refcount_dec_and_test(&rbio->refs))
826 return;
827
828 WARN_ON(!list_empty(&rbio->stripe_cache));
829 WARN_ON(!list_empty(&rbio->hash_list));
830 WARN_ON(!bio_list_empty(&rbio->bio_list));
831
832 for (i = 0; i < rbio->nr_pages; i++) {
833 if (rbio->stripe_pages[i]) {
834 __free_page(rbio->stripe_pages[i]);
835 rbio->stripe_pages[i] = NULL;
836 }
837 }
838
839 btrfs_put_bioc(rbio->bioc);
840 kfree(rbio);
841 }
842
/*
 * Walk a singly-linked chain of bios (via bi_next), completing each
 * one with the given status.
 */
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	for (; cur; cur = next) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
	}
}
855
856 /*
857 * this frees the rbio and runs through all the bios in the
858 * bio_list and calls end_io on them
859 */
rbio_orig_end_io(struct btrfs_raid_bio * rbio,blk_status_t err)860 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
861 {
862 struct bio *cur = bio_list_get(&rbio->bio_list);
863 struct bio *extra;
864
865 if (rbio->generic_bio_cnt)
866 btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
867
868 /*
869 * At this moment, rbio->bio_list is empty, however since rbio does not
870 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
871 * hash list, rbio may be merged with others so that rbio->bio_list
872 * becomes non-empty.
873 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
874 * more and we can call bio_endio() on all queued bios.
875 */
876 unlock_stripe(rbio);
877 extra = bio_list_get(&rbio->bio_list);
878 __free_raid_bio(rbio);
879
880 rbio_endio_bio_list(cur, err);
881 if (extra)
882 rbio_endio_bio_list(extra, err);
883 }
884
885 /*
886 * end io function used by finish_rmw. When we finally
887 * get here, we've written a full stripe
888 */
raid_write_end_io(struct bio * bio)889 static void raid_write_end_io(struct bio *bio)
890 {
891 struct btrfs_raid_bio *rbio = bio->bi_private;
892 blk_status_t err = bio->bi_status;
893 int max_errors;
894
895 if (err)
896 fail_bio_stripe(rbio, bio);
897
898 bio_put(bio);
899
900 if (!atomic_dec_and_test(&rbio->stripes_pending))
901 return;
902
903 err = BLK_STS_OK;
904
905 /* OK, we have read all the stripes we need to. */
906 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
907 0 : rbio->bioc->max_errors;
908 if (atomic_read(&rbio->error) > max_errors)
909 err = BLK_STS_IOERR;
910
911 rbio_orig_end_io(rbio, err);
912 }
913
914 /*
915 * the read/modify/write code wants to use the original bio for
916 * any pages it included, and then use the rbio for everything
917 * else. This function decides if a given index (stripe number)
918 * and page number in that stripe fall inside the original bio
919 * or the rbio.
920 *
921 * if you set bio_list_only, you'll get a NULL back for any ranges
922 * that are outside the bio_list
923 *
924 * This doesn't take any refs on anything, you get a bare page pointer
925 * and the caller must bump refs as required.
926 *
927 * You must call index_rbio_pages once before you can trust
928 * the answers from this function.
929 */
page_in_rbio(struct btrfs_raid_bio * rbio,int index,int pagenr,int bio_list_only)930 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
931 int index, int pagenr, int bio_list_only)
932 {
933 int chunk_page;
934 struct page *p = NULL;
935
936 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
937
938 spin_lock_irq(&rbio->bio_list_lock);
939 p = rbio->bio_pages[chunk_page];
940 spin_unlock_irq(&rbio->bio_list_lock);
941
942 if (p || bio_list_only)
943 return p;
944
945 return rbio->stripe_pages[chunk_page];
946 }
947
948 /*
949 * number of pages we need for the entire stripe across all the
950 * drives
951 */
rbio_nr_pages(unsigned long stripe_len,int nr_stripes)952 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
953 {
954 return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
955 }
956
957 /*
958 * allocation and initial setup for the btrfs_raid_bio. Not
959 * this does not allocate any pages for rbio->pages.
960 */
alloc_rbio(struct btrfs_fs_info * fs_info,struct btrfs_io_context * bioc,u64 stripe_len)961 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
962 struct btrfs_io_context *bioc,
963 u64 stripe_len)
964 {
965 struct btrfs_raid_bio *rbio;
966 int nr_data = 0;
967 int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
968 int num_pages = rbio_nr_pages(stripe_len, real_stripes);
969 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
970 void *p;
971
972 rbio = kzalloc(sizeof(*rbio) +
973 sizeof(*rbio->stripe_pages) * num_pages +
974 sizeof(*rbio->bio_pages) * num_pages +
975 sizeof(*rbio->finish_pointers) * real_stripes +
976 sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
977 sizeof(*rbio->finish_pbitmap) *
978 BITS_TO_LONGS(stripe_npages),
979 GFP_NOFS);
980 if (!rbio)
981 return ERR_PTR(-ENOMEM);
982
983 bio_list_init(&rbio->bio_list);
984 INIT_LIST_HEAD(&rbio->plug_list);
985 spin_lock_init(&rbio->bio_list_lock);
986 INIT_LIST_HEAD(&rbio->stripe_cache);
987 INIT_LIST_HEAD(&rbio->hash_list);
988 rbio->bioc = bioc;
989 rbio->stripe_len = stripe_len;
990 rbio->nr_pages = num_pages;
991 rbio->real_stripes = real_stripes;
992 rbio->stripe_npages = stripe_npages;
993 rbio->faila = -1;
994 rbio->failb = -1;
995 refcount_set(&rbio->refs, 1);
996 atomic_set(&rbio->error, 0);
997 atomic_set(&rbio->stripes_pending, 0);
998
999 /*
1000 * the stripe_pages, bio_pages, etc arrays point to the extra
1001 * memory we allocated past the end of the rbio
1002 */
1003 p = rbio + 1;
1004 #define CONSUME_ALLOC(ptr, count) do { \
1005 ptr = p; \
1006 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
1007 } while (0)
1008 CONSUME_ALLOC(rbio->stripe_pages, num_pages);
1009 CONSUME_ALLOC(rbio->bio_pages, num_pages);
1010 CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
1011 CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
1012 CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
1013 #undef CONSUME_ALLOC
1014
1015 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1016 nr_data = real_stripes - 1;
1017 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1018 nr_data = real_stripes - 2;
1019 else
1020 BUG();
1021
1022 rbio->nr_data = nr_data;
1023 return rbio;
1024 }
1025
1026 /* allocate pages for all the stripes in the bio, including parity */
alloc_rbio_pages(struct btrfs_raid_bio * rbio)1027 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1028 {
1029 int i;
1030 struct page *page;
1031
1032 for (i = 0; i < rbio->nr_pages; i++) {
1033 if (rbio->stripe_pages[i])
1034 continue;
1035 page = alloc_page(GFP_NOFS);
1036 if (!page)
1037 return -ENOMEM;
1038 rbio->stripe_pages[i] = page;
1039 }
1040 return 0;
1041 }
1042
1043 /* only allocate pages for p/q stripes */
alloc_rbio_parity_pages(struct btrfs_raid_bio * rbio)1044 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1045 {
1046 int i;
1047 struct page *page;
1048
1049 i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
1050
1051 for (; i < rbio->nr_pages; i++) {
1052 if (rbio->stripe_pages[i])
1053 continue;
1054 page = alloc_page(GFP_NOFS);
1055 if (!page)
1056 return -ENOMEM;
1057 rbio->stripe_pages[i] = page;
1058 }
1059 return 0;
1060 }
1061
1062 /*
1063 * add a single page from a specific stripe into our list of bios for IO
1064 * this will try to merge into existing bios if possible, and returns
1065 * zero if all went well.
1066 */
rbio_add_io_page(struct btrfs_raid_bio * rbio,struct bio_list * bio_list,struct page * page,int stripe_nr,unsigned long page_index,unsigned long bio_max_len)1067 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1068 struct bio_list *bio_list,
1069 struct page *page,
1070 int stripe_nr,
1071 unsigned long page_index,
1072 unsigned long bio_max_len)
1073 {
1074 struct bio *last = bio_list->tail;
1075 int ret;
1076 struct bio *bio;
1077 struct btrfs_io_stripe *stripe;
1078 u64 disk_start;
1079
1080 stripe = &rbio->bioc->stripes[stripe_nr];
1081 disk_start = stripe->physical + (page_index << PAGE_SHIFT);
1082
1083 /* if the device is missing, just fail this stripe */
1084 if (!stripe->dev->bdev)
1085 return fail_rbio_index(rbio, stripe_nr);
1086
1087 /* see if we can add this page onto our existing bio */
1088 if (last) {
1089 u64 last_end = last->bi_iter.bi_sector << 9;
1090 last_end += last->bi_iter.bi_size;
1091
1092 /*
1093 * we can't merge these if they are from different
1094 * devices or if they are not contiguous
1095 */
1096 if (last_end == disk_start && !last->bi_status &&
1097 last->bi_bdev == stripe->dev->bdev) {
1098 ret = bio_add_page(last, page, PAGE_SIZE, 0);
1099 if (ret == PAGE_SIZE)
1100 return 0;
1101 }
1102 }
1103
1104 /* put a new bio on the list */
1105 bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
1106 btrfs_bio(bio)->device = stripe->dev;
1107 bio->bi_iter.bi_size = 0;
1108 bio_set_dev(bio, stripe->dev->bdev);
1109 bio->bi_iter.bi_sector = disk_start >> 9;
1110
1111 bio_add_page(bio, page, PAGE_SIZE, 0);
1112 bio_list_add(bio_list, bio);
1113 return 0;
1114 }
1115
1116 /*
1117 * while we're doing the read/modify/write cycle, we could
1118 * have errors in reading pages off the disk. This checks
1119 * for errors and if we're not able to read the page it'll
1120 * trigger parity reconstruction. The rmw will be finished
1121 * after we've reconstructed the failed stripes
1122 */
validate_rbio_for_rmw(struct btrfs_raid_bio * rbio)1123 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1124 {
1125 if (rbio->faila >= 0 || rbio->failb >= 0) {
1126 BUG_ON(rbio->faila == rbio->real_stripes - 1);
1127 __raid56_parity_recover(rbio);
1128 } else {
1129 finish_rmw(rbio);
1130 }
1131 }
1132
1133 /*
1134 * helper function to walk our bio list and populate the bio_pages array with
1135 * the result. This seems expensive, but it is faster than constantly
1136 * searching through the bio list as we setup the IO in finish_rmw or stripe
1137 * reconstruction.
1138 *
1139 * This must be called before you trust the answers from page_in_rbio
1140 */
index_rbio_pages(struct btrfs_raid_bio * rbio)1141 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1142 {
1143 struct bio *bio;
1144 u64 start;
1145 unsigned long stripe_offset;
1146 unsigned long page_index;
1147
1148 spin_lock_irq(&rbio->bio_list_lock);
1149 bio_list_for_each(bio, &rbio->bio_list) {
1150 struct bio_vec bvec;
1151 struct bvec_iter iter;
1152 int i = 0;
1153
1154 start = bio->bi_iter.bi_sector << 9;
1155 stripe_offset = start - rbio->bioc->raid_map[0];
1156 page_index = stripe_offset >> PAGE_SHIFT;
1157
1158 if (bio_flagged(bio, BIO_CLONED))
1159 bio->bi_iter = btrfs_bio(bio)->iter;
1160
1161 bio_for_each_segment(bvec, bio, iter) {
1162 rbio->bio_pages[page_index + i] = bvec.bv_page;
1163 i++;
1164 }
1165 }
1166 spin_unlock_irq(&rbio->bio_list_lock);
1167 }
1168
1169 /*
1170 * this is called from one of two situations. We either
1171 * have a full stripe from the higher layers, or we've read all
1172 * the missing bits off disk.
1173 *
1174 * This will calculate the parity and then send down any
1175 * changed blocks.
1176 */
finish_rmw(struct btrfs_raid_bio * rbio)1177 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1178 {
1179 struct btrfs_io_context *bioc = rbio->bioc;
1180 void **pointers = rbio->finish_pointers;
1181 int nr_data = rbio->nr_data;
1182 int stripe;
1183 int pagenr;
1184 bool has_qstripe;
1185 struct bio_list bio_list;
1186 struct bio *bio;
1187 int ret;
1188
1189 bio_list_init(&bio_list);
1190
1191 if (rbio->real_stripes - rbio->nr_data == 1)
1192 has_qstripe = false;
1193 else if (rbio->real_stripes - rbio->nr_data == 2)
1194 has_qstripe = true;
1195 else
1196 BUG();
1197
1198 /* at this point we either have a full stripe,
1199 * or we've read the full stripe from the drive.
1200 * recalculate the parity and write the new results.
1201 *
1202 * We're not allowed to add any new bios to the
1203 * bio list here, anyone else that wants to
1204 * change this stripe needs to do their own rmw.
1205 */
1206 spin_lock_irq(&rbio->bio_list_lock);
1207 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1208 spin_unlock_irq(&rbio->bio_list_lock);
1209
1210 atomic_set(&rbio->error, 0);
1211
1212 /*
1213 * now that we've set rmw_locked, run through the
1214 * bio list one last time and map the page pointers
1215 *
1216 * We don't cache full rbios because we're assuming
1217 * the higher layers are unlikely to use this area of
1218 * the disk again soon. If they do use it again,
1219 * hopefully they will send another full bio.
1220 */
1221 index_rbio_pages(rbio);
1222 if (!rbio_is_full(rbio))
1223 cache_rbio_pages(rbio);
1224 else
1225 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1226
1227 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1228 struct page *p;
1229 /* first collect one page from each data stripe */
1230 for (stripe = 0; stripe < nr_data; stripe++) {
1231 p = page_in_rbio(rbio, stripe, pagenr, 0);
1232 pointers[stripe] = kmap_local_page(p);
1233 }
1234
1235 /* then add the parity stripe */
1236 p = rbio_pstripe_page(rbio, pagenr);
1237 SetPageUptodate(p);
1238 pointers[stripe++] = kmap_local_page(p);
1239
1240 if (has_qstripe) {
1241
1242 /*
1243 * raid6, add the qstripe and call the
1244 * library function to fill in our p/q
1245 */
1246 p = rbio_qstripe_page(rbio, pagenr);
1247 SetPageUptodate(p);
1248 pointers[stripe++] = kmap_local_page(p);
1249
1250 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1251 pointers);
1252 } else {
1253 /* raid5 */
1254 copy_page(pointers[nr_data], pointers[0]);
1255 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
1256 }
1257 for (stripe = stripe - 1; stripe >= 0; stripe--)
1258 kunmap_local(pointers[stripe]);
1259 }
1260
1261 /*
1262 * time to start writing. Make bios for everything from the
1263 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1264 * everything else.
1265 */
1266 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1267 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1268 struct page *page;
1269 if (stripe < rbio->nr_data) {
1270 page = page_in_rbio(rbio, stripe, pagenr, 1);
1271 if (!page)
1272 continue;
1273 } else {
1274 page = rbio_stripe_page(rbio, stripe, pagenr);
1275 }
1276
1277 ret = rbio_add_io_page(rbio, &bio_list,
1278 page, stripe, pagenr, rbio->stripe_len);
1279 if (ret)
1280 goto cleanup;
1281 }
1282 }
1283
1284 if (likely(!bioc->num_tgtdevs))
1285 goto write_data;
1286
1287 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1288 if (!bioc->tgtdev_map[stripe])
1289 continue;
1290
1291 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1292 struct page *page;
1293 if (stripe < rbio->nr_data) {
1294 page = page_in_rbio(rbio, stripe, pagenr, 1);
1295 if (!page)
1296 continue;
1297 } else {
1298 page = rbio_stripe_page(rbio, stripe, pagenr);
1299 }
1300
1301 ret = rbio_add_io_page(rbio, &bio_list, page,
1302 rbio->bioc->tgtdev_map[stripe],
1303 pagenr, rbio->stripe_len);
1304 if (ret)
1305 goto cleanup;
1306 }
1307 }
1308
1309 write_data:
1310 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1311 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1312
1313 while ((bio = bio_list_pop(&bio_list))) {
1314 bio->bi_private = rbio;
1315 bio->bi_end_io = raid_write_end_io;
1316 bio->bi_opf = REQ_OP_WRITE;
1317
1318 submit_bio(bio);
1319 }
1320 return;
1321
1322 cleanup:
1323 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1324
1325 while ((bio = bio_list_pop(&bio_list)))
1326 bio_put(bio);
1327 }
1328
1329 /*
1330 * helper to find the stripe number for a given bio. Used to figure out which
1331 * stripe has failed. This expects the bio to correspond to a physical disk,
1332 * so it looks up based on physical sector numbers.
1333 */
find_bio_stripe(struct btrfs_raid_bio * rbio,struct bio * bio)1334 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1335 struct bio *bio)
1336 {
1337 u64 physical = bio->bi_iter.bi_sector;
1338 int i;
1339 struct btrfs_io_stripe *stripe;
1340
1341 physical <<= 9;
1342
1343 for (i = 0; i < rbio->bioc->num_stripes; i++) {
1344 stripe = &rbio->bioc->stripes[i];
1345 if (in_range(physical, stripe->physical, rbio->stripe_len) &&
1346 stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
1347 return i;
1348 }
1349 }
1350 return -1;
1351 }
1352
1353 /*
1354 * helper to find the stripe number for a given
1355 * bio (before mapping). Used to figure out which stripe has
1356 * failed. This looks up based on logical block numbers.
1357 */
find_logical_bio_stripe(struct btrfs_raid_bio * rbio,struct bio * bio)1358 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1359 struct bio *bio)
1360 {
1361 u64 logical = bio->bi_iter.bi_sector << 9;
1362 int i;
1363
1364 for (i = 0; i < rbio->nr_data; i++) {
1365 u64 stripe_start = rbio->bioc->raid_map[i];
1366
1367 if (in_range(logical, stripe_start, rbio->stripe_len))
1368 return i;
1369 }
1370 return -1;
1371 }
1372
1373 /*
1374 * returns -EIO if we had too many failures
1375 */
fail_rbio_index(struct btrfs_raid_bio * rbio,int failed)1376 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1377 {
1378 unsigned long flags;
1379 int ret = 0;
1380
1381 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1382
1383 /* we already know this stripe is bad, move on */
1384 if (rbio->faila == failed || rbio->failb == failed)
1385 goto out;
1386
1387 if (rbio->faila == -1) {
1388 /* first failure on this rbio */
1389 rbio->faila = failed;
1390 atomic_inc(&rbio->error);
1391 } else if (rbio->failb == -1) {
1392 /* second failure on this rbio */
1393 rbio->failb = failed;
1394 atomic_inc(&rbio->error);
1395 } else {
1396 ret = -EIO;
1397 }
1398 out:
1399 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1400
1401 return ret;
1402 }
1403
1404 /*
1405 * helper to fail a stripe based on a physical disk
1406 * bio.
1407 */
fail_bio_stripe(struct btrfs_raid_bio * rbio,struct bio * bio)1408 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1409 struct bio *bio)
1410 {
1411 int failed = find_bio_stripe(rbio, bio);
1412
1413 if (failed < 0)
1414 return -EIO;
1415
1416 return fail_rbio_index(rbio, failed);
1417 }
1418
1419 /*
1420 * this sets each page in the bio uptodate. It should only be used on private
1421 * rbio pages, nothing that comes in from the higher layers
1422 */
set_bio_pages_uptodate(struct bio * bio)1423 static void set_bio_pages_uptodate(struct bio *bio)
1424 {
1425 struct bio_vec *bvec;
1426 struct bvec_iter_all iter_all;
1427
1428 ASSERT(!bio_flagged(bio, BIO_CLONED));
1429
1430 bio_for_each_segment_all(bvec, bio, iter_all)
1431 SetPageUptodate(bvec->bv_page);
1432 }
1433
1434 /*
1435 * end io for the read phase of the rmw cycle. All the bios here are physical
1436 * stripe bios we've read from the disk so we can recalculate the parity of the
1437 * stripe.
1438 *
1439 * This will usually kick off finish_rmw once all the bios are read in, but it
1440 * may trigger parity reconstruction if we had any errors along the way
1441 */
raid_rmw_end_io(struct bio * bio)1442 static void raid_rmw_end_io(struct bio *bio)
1443 {
1444 struct btrfs_raid_bio *rbio = bio->bi_private;
1445
1446 if (bio->bi_status)
1447 fail_bio_stripe(rbio, bio);
1448 else
1449 set_bio_pages_uptodate(bio);
1450
1451 bio_put(bio);
1452
1453 if (!atomic_dec_and_test(&rbio->stripes_pending))
1454 return;
1455
1456 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
1457 goto cleanup;
1458
1459 /*
1460 * this will normally call finish_rmw to start our write
1461 * but if there are any failed stripes we'll reconstruct
1462 * from parity first
1463 */
1464 validate_rbio_for_rmw(rbio);
1465 return;
1466
1467 cleanup:
1468
1469 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1470 }
1471
1472 /*
1473 * the stripe must be locked by the caller. It will
1474 * unlock after all the writes are done
1475 */
raid56_rmw_stripe(struct btrfs_raid_bio * rbio)1476 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1477 {
1478 int bios_to_read = 0;
1479 struct bio_list bio_list;
1480 int ret;
1481 int pagenr;
1482 int stripe;
1483 struct bio *bio;
1484
1485 bio_list_init(&bio_list);
1486
1487 ret = alloc_rbio_pages(rbio);
1488 if (ret)
1489 goto cleanup;
1490
1491 index_rbio_pages(rbio);
1492
1493 atomic_set(&rbio->error, 0);
1494 /*
1495 * build a list of bios to read all the missing parts of this
1496 * stripe
1497 */
1498 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1499 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1500 struct page *page;
1501 /*
1502 * we want to find all the pages missing from
1503 * the rbio and read them from the disk. If
1504 * page_in_rbio finds a page in the bio list
1505 * we don't need to read it off the stripe.
1506 */
1507 page = page_in_rbio(rbio, stripe, pagenr, 1);
1508 if (page)
1509 continue;
1510
1511 page = rbio_stripe_page(rbio, stripe, pagenr);
1512 /*
1513 * the bio cache may have handed us an uptodate
1514 * page. If so, be happy and use it
1515 */
1516 if (PageUptodate(page))
1517 continue;
1518
1519 ret = rbio_add_io_page(rbio, &bio_list, page,
1520 stripe, pagenr, rbio->stripe_len);
1521 if (ret)
1522 goto cleanup;
1523 }
1524 }
1525
1526 bios_to_read = bio_list_size(&bio_list);
1527 if (!bios_to_read) {
1528 /*
1529 * this can happen if others have merged with
1530 * us, it means there is nothing left to read.
1531 * But if there are missing devices it may not be
1532 * safe to do the full stripe write yet.
1533 */
1534 goto finish;
1535 }
1536
1537 /*
1538 * The bioc may be freed once we submit the last bio. Make sure not to
1539 * touch it after that.
1540 */
1541 atomic_set(&rbio->stripes_pending, bios_to_read);
1542 while ((bio = bio_list_pop(&bio_list))) {
1543 bio->bi_private = rbio;
1544 bio->bi_end_io = raid_rmw_end_io;
1545 bio->bi_opf = REQ_OP_READ;
1546
1547 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
1548
1549 submit_bio(bio);
1550 }
1551 /* the actual write will happen once the reads are done */
1552 return 0;
1553
1554 cleanup:
1555 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1556
1557 while ((bio = bio_list_pop(&bio_list)))
1558 bio_put(bio);
1559
1560 return -EIO;
1561
1562 finish:
1563 validate_rbio_for_rmw(rbio);
1564 return 0;
1565 }
1566
1567 /*
1568 * if the upper layers pass in a full stripe, we thank them by only allocating
1569 * enough pages to hold the parity, and sending it all down quickly.
1570 */
full_stripe_write(struct btrfs_raid_bio * rbio)1571 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1572 {
1573 int ret;
1574
1575 ret = alloc_rbio_parity_pages(rbio);
1576 if (ret) {
1577 __free_raid_bio(rbio);
1578 return ret;
1579 }
1580
1581 ret = lock_stripe_add(rbio);
1582 if (ret == 0)
1583 finish_rmw(rbio);
1584 return 0;
1585 }
1586
1587 /*
1588 * partial stripe writes get handed over to async helpers.
1589 * We're really hoping to merge a few more writes into this
1590 * rbio before calculating new parity
1591 */
partial_stripe_write(struct btrfs_raid_bio * rbio)1592 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1593 {
1594 int ret;
1595
1596 ret = lock_stripe_add(rbio);
1597 if (ret == 0)
1598 start_async_work(rbio, rmw_work);
1599 return 0;
1600 }
1601
1602 /*
1603 * sometimes while we were reading from the drive to
1604 * recalculate parity, enough new bios come into create
1605 * a full stripe. So we do a check here to see if we can
1606 * go directly to finish_rmw
1607 */
__raid56_parity_write(struct btrfs_raid_bio * rbio)1608 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1609 {
1610 /* head off into rmw land if we don't have a full stripe */
1611 if (!rbio_is_full(rbio))
1612 return partial_stripe_write(rbio);
1613 return full_stripe_write(rbio);
1614 }
1615
1616 /*
1617 * We use plugging call backs to collect full stripes.
1618 * Any time we get a partial stripe write while plugged
1619 * we collect it into a list. When the unplug comes down,
1620 * we sort the list by logical block number and merge
1621 * everything we can into the same rbios
1622 */
1623 struct btrfs_plug_cb {
1624 struct blk_plug_cb cb;
1625 struct btrfs_fs_info *info;
1626 struct list_head rbio_list;
1627 struct btrfs_work work;
1628 };
1629
1630 /*
1631 * rbios on the plug list are sorted for easier merging.
1632 */
plug_cmp(void * priv,const struct list_head * a,const struct list_head * b)1633 static int plug_cmp(void *priv, const struct list_head *a,
1634 const struct list_head *b)
1635 {
1636 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1637 plug_list);
1638 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1639 plug_list);
1640 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1641 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1642
1643 if (a_sector < b_sector)
1644 return -1;
1645 if (a_sector > b_sector)
1646 return 1;
1647 return 0;
1648 }
1649
run_plug(struct btrfs_plug_cb * plug)1650 static void run_plug(struct btrfs_plug_cb *plug)
1651 {
1652 struct btrfs_raid_bio *cur;
1653 struct btrfs_raid_bio *last = NULL;
1654
1655 /*
1656 * sort our plug list then try to merge
1657 * everything we can in hopes of creating full
1658 * stripes.
1659 */
1660 list_sort(NULL, &plug->rbio_list, plug_cmp);
1661 while (!list_empty(&plug->rbio_list)) {
1662 cur = list_entry(plug->rbio_list.next,
1663 struct btrfs_raid_bio, plug_list);
1664 list_del_init(&cur->plug_list);
1665
1666 if (rbio_is_full(cur)) {
1667 int ret;
1668
1669 /* we have a full stripe, send it down */
1670 ret = full_stripe_write(cur);
1671 BUG_ON(ret);
1672 continue;
1673 }
1674 if (last) {
1675 if (rbio_can_merge(last, cur)) {
1676 merge_rbio(last, cur);
1677 __free_raid_bio(cur);
1678 continue;
1679
1680 }
1681 __raid56_parity_write(last);
1682 }
1683 last = cur;
1684 }
1685 if (last) {
1686 __raid56_parity_write(last);
1687 }
1688 kfree(plug);
1689 }
1690
1691 /*
1692 * if the unplug comes from schedule, we have to push the
1693 * work off to a helper thread
1694 */
unplug_work(struct btrfs_work * work)1695 static void unplug_work(struct btrfs_work *work)
1696 {
1697 struct btrfs_plug_cb *plug;
1698 plug = container_of(work, struct btrfs_plug_cb, work);
1699 run_plug(plug);
1700 }
1701
btrfs_raid_unplug(struct blk_plug_cb * cb,bool from_schedule)1702 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1703 {
1704 struct btrfs_plug_cb *plug;
1705 plug = container_of(cb, struct btrfs_plug_cb, cb);
1706
1707 if (from_schedule) {
1708 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1709 btrfs_queue_work(plug->info->rmw_workers,
1710 &plug->work);
1711 return;
1712 }
1713 run_plug(plug);
1714 }
1715
1716 /*
1717 * our main entry point for writes from the rest of the FS.
1718 */
raid56_parity_write(struct bio * bio,struct btrfs_io_context * bioc,u64 stripe_len)1719 int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
1720 u64 stripe_len)
1721 {
1722 struct btrfs_fs_info *fs_info = bioc->fs_info;
1723 struct btrfs_raid_bio *rbio;
1724 struct btrfs_plug_cb *plug = NULL;
1725 struct blk_plug_cb *cb;
1726 int ret;
1727
1728 rbio = alloc_rbio(fs_info, bioc, stripe_len);
1729 if (IS_ERR(rbio)) {
1730 btrfs_put_bioc(bioc);
1731 return PTR_ERR(rbio);
1732 }
1733 bio_list_add(&rbio->bio_list, bio);
1734 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1735 rbio->operation = BTRFS_RBIO_WRITE;
1736
1737 btrfs_bio_counter_inc_noblocked(fs_info);
1738 rbio->generic_bio_cnt = 1;
1739
1740 /*
1741 * don't plug on full rbios, just get them out the door
1742 * as quickly as we can
1743 */
1744 if (rbio_is_full(rbio)) {
1745 ret = full_stripe_write(rbio);
1746 if (ret)
1747 btrfs_bio_counter_dec(fs_info);
1748 return ret;
1749 }
1750
1751 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
1752 if (cb) {
1753 plug = container_of(cb, struct btrfs_plug_cb, cb);
1754 if (!plug->info) {
1755 plug->info = fs_info;
1756 INIT_LIST_HEAD(&plug->rbio_list);
1757 }
1758 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1759 ret = 0;
1760 } else {
1761 ret = __raid56_parity_write(rbio);
1762 if (ret)
1763 btrfs_bio_counter_dec(fs_info);
1764 }
1765 return ret;
1766 }
1767
1768 /*
1769 * all parity reconstruction happens here. We've read in everything
1770 * we can find from the drives and this does the heavy lifting of
1771 * sorting the good from the bad.
1772 */
__raid_recover_end_io(struct btrfs_raid_bio * rbio)1773 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1774 {
1775 int pagenr, stripe;
1776 void **pointers;
1777 void **unmap_array;
1778 int faila = -1, failb = -1;
1779 struct page *page;
1780 blk_status_t err;
1781 int i;
1782
1783 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1784 if (!pointers) {
1785 err = BLK_STS_RESOURCE;
1786 goto cleanup_io;
1787 }
1788
1789 /*
1790 * Store copy of pointers that does not get reordered during
1791 * reconstruction so that kunmap_local works.
1792 */
1793 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1794 if (!unmap_array) {
1795 err = BLK_STS_RESOURCE;
1796 goto cleanup_pointers;
1797 }
1798
1799 faila = rbio->faila;
1800 failb = rbio->failb;
1801
1802 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1803 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1804 spin_lock_irq(&rbio->bio_list_lock);
1805 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1806 spin_unlock_irq(&rbio->bio_list_lock);
1807 }
1808
1809 index_rbio_pages(rbio);
1810
1811 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1812 /*
1813 * Now we just use bitmap to mark the horizontal stripes in
1814 * which we have data when doing parity scrub.
1815 */
1816 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1817 !test_bit(pagenr, rbio->dbitmap))
1818 continue;
1819
1820 /*
1821 * Setup our array of pointers with pages from each stripe
1822 *
1823 * NOTE: store a duplicate array of pointers to preserve the
1824 * pointer order
1825 */
1826 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1827 /*
1828 * if we're rebuilding a read, we have to use
1829 * pages from the bio list
1830 */
1831 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1832 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1833 (stripe == faila || stripe == failb)) {
1834 page = page_in_rbio(rbio, stripe, pagenr, 0);
1835 } else {
1836 page = rbio_stripe_page(rbio, stripe, pagenr);
1837 }
1838 pointers[stripe] = kmap_local_page(page);
1839 unmap_array[stripe] = pointers[stripe];
1840 }
1841
1842 /* all raid6 handling here */
1843 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1844 /*
1845 * single failure, rebuild from parity raid5
1846 * style
1847 */
1848 if (failb < 0) {
1849 if (faila == rbio->nr_data) {
1850 /*
1851 * Just the P stripe has failed, without
1852 * a bad data or Q stripe.
1853 * TODO, we should redo the xor here.
1854 */
1855 err = BLK_STS_IOERR;
1856 goto cleanup;
1857 }
1858 /*
1859 * a single failure in raid6 is rebuilt
1860 * in the pstripe code below
1861 */
1862 goto pstripe;
1863 }
1864
1865 /* make sure our ps and qs are in order */
1866 if (faila > failb)
1867 swap(faila, failb);
1868
1869 /* if the q stripe is failed, do a pstripe reconstruction
1870 * from the xors.
1871 * If both the q stripe and the P stripe are failed, we're
1872 * here due to a crc mismatch and we can't give them the
1873 * data they want
1874 */
1875 if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
1876 if (rbio->bioc->raid_map[faila] ==
1877 RAID5_P_STRIPE) {
1878 err = BLK_STS_IOERR;
1879 goto cleanup;
1880 }
1881 /*
1882 * otherwise we have one bad data stripe and
1883 * a good P stripe. raid5!
1884 */
1885 goto pstripe;
1886 }
1887
1888 if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
1889 raid6_datap_recov(rbio->real_stripes,
1890 PAGE_SIZE, faila, pointers);
1891 } else {
1892 raid6_2data_recov(rbio->real_stripes,
1893 PAGE_SIZE, faila, failb,
1894 pointers);
1895 }
1896 } else {
1897 void *p;
1898
1899 /* rebuild from P stripe here (raid5 or raid6) */
1900 BUG_ON(failb != -1);
1901 pstripe:
1902 /* Copy parity block into failed block to start with */
1903 copy_page(pointers[faila], pointers[rbio->nr_data]);
1904
1905 /* rearrange the pointer array */
1906 p = pointers[faila];
1907 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1908 pointers[stripe] = pointers[stripe + 1];
1909 pointers[rbio->nr_data - 1] = p;
1910
1911 /* xor in the rest */
1912 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
1913 }
1914 /* if we're doing this rebuild as part of an rmw, go through
1915 * and set all of our private rbio pages in the
1916 * failed stripes as uptodate. This way finish_rmw will
1917 * know they can be trusted. If this was a read reconstruction,
1918 * other endio functions will fiddle the uptodate bits
1919 */
1920 if (rbio->operation == BTRFS_RBIO_WRITE) {
1921 for (i = 0; i < rbio->stripe_npages; i++) {
1922 if (faila != -1) {
1923 page = rbio_stripe_page(rbio, faila, i);
1924 SetPageUptodate(page);
1925 }
1926 if (failb != -1) {
1927 page = rbio_stripe_page(rbio, failb, i);
1928 SetPageUptodate(page);
1929 }
1930 }
1931 }
1932 for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
1933 kunmap_local(unmap_array[stripe]);
1934 }
1935
1936 err = BLK_STS_OK;
1937 cleanup:
1938 kfree(unmap_array);
1939 cleanup_pointers:
1940 kfree(pointers);
1941
1942 cleanup_io:
1943 /*
1944 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
1945 * valid rbio which is consistent with ondisk content, thus such a
1946 * valid rbio can be cached to avoid further disk reads.
1947 */
1948 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1949 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1950 /*
1951 * - In case of two failures, where rbio->failb != -1:
1952 *
1953 * Do not cache this rbio since the above read reconstruction
1954 * (raid6_datap_recov() or raid6_2data_recov()) may have
1955 * changed some content of stripes which are not identical to
1956 * on-disk content any more, otherwise, a later write/recover
1957 * may steal stripe_pages from this rbio and end up with
1958 * corruptions or rebuild failures.
1959 *
1960 * - In case of single failure, where rbio->failb == -1:
1961 *
1962 * Cache this rbio iff the above read reconstruction is
1963 * executed without problems.
1964 */
1965 if (err == BLK_STS_OK && rbio->failb < 0)
1966 cache_rbio_pages(rbio);
1967 else
1968 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1969
1970 rbio_orig_end_io(rbio, err);
1971 } else if (err == BLK_STS_OK) {
1972 rbio->faila = -1;
1973 rbio->failb = -1;
1974
1975 if (rbio->operation == BTRFS_RBIO_WRITE)
1976 finish_rmw(rbio);
1977 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
1978 finish_parity_scrub(rbio, 0);
1979 else
1980 BUG();
1981 } else {
1982 rbio_orig_end_io(rbio, err);
1983 }
1984 }
1985
1986 /*
1987 * This is called only for stripes we've read from disk to
1988 * reconstruct the parity.
1989 */
raid_recover_end_io(struct bio * bio)1990 static void raid_recover_end_io(struct bio *bio)
1991 {
1992 struct btrfs_raid_bio *rbio = bio->bi_private;
1993
1994 /*
1995 * we only read stripe pages off the disk, set them
1996 * up to date if there were no errors
1997 */
1998 if (bio->bi_status)
1999 fail_bio_stripe(rbio, bio);
2000 else
2001 set_bio_pages_uptodate(bio);
2002 bio_put(bio);
2003
2004 if (!atomic_dec_and_test(&rbio->stripes_pending))
2005 return;
2006
2007 if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2008 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2009 else
2010 __raid_recover_end_io(rbio);
2011 }
2012
2013 /*
2014 * reads everything we need off the disk to reconstruct
2015 * the parity. endio handlers trigger final reconstruction
2016 * when the IO is done.
2017 *
2018 * This is used both for reads from the higher layers and for
2019 * parity construction required to finish a rmw cycle.
2020 */
__raid56_parity_recover(struct btrfs_raid_bio * rbio)2021 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2022 {
2023 int bios_to_read = 0;
2024 struct bio_list bio_list;
2025 int ret;
2026 int pagenr;
2027 int stripe;
2028 struct bio *bio;
2029
2030 bio_list_init(&bio_list);
2031
2032 ret = alloc_rbio_pages(rbio);
2033 if (ret)
2034 goto cleanup;
2035
2036 atomic_set(&rbio->error, 0);
2037
2038 /*
2039 * read everything that hasn't failed. Thanks to the
2040 * stripe cache, it is possible that some or all of these
2041 * pages are going to be uptodate.
2042 */
2043 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2044 if (rbio->faila == stripe || rbio->failb == stripe) {
2045 atomic_inc(&rbio->error);
2046 continue;
2047 }
2048
2049 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
2050 struct page *p;
2051
2052 /*
2053 * the rmw code may have already read this
2054 * page in
2055 */
2056 p = rbio_stripe_page(rbio, stripe, pagenr);
2057 if (PageUptodate(p))
2058 continue;
2059
2060 ret = rbio_add_io_page(rbio, &bio_list,
2061 rbio_stripe_page(rbio, stripe, pagenr),
2062 stripe, pagenr, rbio->stripe_len);
2063 if (ret < 0)
2064 goto cleanup;
2065 }
2066 }
2067
2068 bios_to_read = bio_list_size(&bio_list);
2069 if (!bios_to_read) {
2070 /*
2071 * we might have no bios to read just because the pages
2072 * were up to date, or we might have no bios to read because
2073 * the devices were gone.
2074 */
2075 if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
2076 __raid_recover_end_io(rbio);
2077 return 0;
2078 } else {
2079 goto cleanup;
2080 }
2081 }
2082
2083 /*
2084 * The bioc may be freed once we submit the last bio. Make sure not to
2085 * touch it after that.
2086 */
2087 atomic_set(&rbio->stripes_pending, bios_to_read);
2088 while ((bio = bio_list_pop(&bio_list))) {
2089 bio->bi_private = rbio;
2090 bio->bi_end_io = raid_recover_end_io;
2091 bio->bi_opf = REQ_OP_READ;
2092
2093 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2094
2095 submit_bio(bio);
2096 }
2097
2098 return 0;
2099
2100 cleanup:
2101 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2102 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2103 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2104
2105 while ((bio = bio_list_pop(&bio_list)))
2106 bio_put(bio);
2107
2108 return -EIO;
2109 }
2110
2111 /*
2112 * the main entry point for reads from the higher layers. This
2113 * is really only called when the normal read path had a failure,
2114 * so we assume the bio they send down corresponds to a failed part
2115 * of the drive.
2116 */
raid56_parity_recover(struct bio * bio,struct btrfs_io_context * bioc,u64 stripe_len,int mirror_num,int generic_io)2117 int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2118 u64 stripe_len, int mirror_num, int generic_io)
2119 {
2120 struct btrfs_fs_info *fs_info = bioc->fs_info;
2121 struct btrfs_raid_bio *rbio;
2122 int ret;
2123
2124 if (generic_io) {
2125 ASSERT(bioc->mirror_num == mirror_num);
2126 btrfs_bio(bio)->mirror_num = mirror_num;
2127 }
2128
2129 rbio = alloc_rbio(fs_info, bioc, stripe_len);
2130 if (IS_ERR(rbio)) {
2131 if (generic_io)
2132 btrfs_put_bioc(bioc);
2133 return PTR_ERR(rbio);
2134 }
2135
2136 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2137 bio_list_add(&rbio->bio_list, bio);
2138 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2139
2140 rbio->faila = find_logical_bio_stripe(rbio, bio);
2141 if (rbio->faila == -1) {
2142 btrfs_warn(fs_info,
2143 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
2144 __func__, bio->bi_iter.bi_sector << 9,
2145 (u64)bio->bi_iter.bi_size, bioc->map_type);
2146 if (generic_io)
2147 btrfs_put_bioc(bioc);
2148 kfree(rbio);
2149 return -EIO;
2150 }
2151
2152 if (generic_io) {
2153 btrfs_bio_counter_inc_noblocked(fs_info);
2154 rbio->generic_bio_cnt = 1;
2155 } else {
2156 btrfs_get_bioc(bioc);
2157 }
2158
2159 /*
2160 * Loop retry:
2161 * for 'mirror == 2', reconstruct from all other stripes.
2162 * for 'mirror_num > 2', select a stripe to fail on every retry.
2163 */
2164 if (mirror_num > 2) {
2165 /*
2166 * 'mirror == 3' is to fail the p stripe and
2167 * reconstruct from the q stripe. 'mirror > 3' is to
2168 * fail a data stripe and reconstruct from p+q stripe.
2169 */
2170 rbio->failb = rbio->real_stripes - (mirror_num - 1);
2171 ASSERT(rbio->failb > 0);
2172 if (rbio->failb <= rbio->faila)
2173 rbio->failb--;
2174 }
2175
2176 ret = lock_stripe_add(rbio);
2177
2178 /*
2179 * __raid56_parity_recover will end the bio with
2180 * any errors it hits. We don't want to return
2181 * its error value up the stack because our caller
2182 * will end up calling bio_endio with any nonzero
2183 * return
2184 */
2185 if (ret == 0)
2186 __raid56_parity_recover(rbio);
2187 /*
2188 * our rbio has been added to the list of
2189 * rbios that will be handled after the
2190 * currently lock owner is done
2191 */
2192 return 0;
2193
2194 }
2195
rmw_work(struct btrfs_work * work)2196 static void rmw_work(struct btrfs_work *work)
2197 {
2198 struct btrfs_raid_bio *rbio;
2199
2200 rbio = container_of(work, struct btrfs_raid_bio, work);
2201 raid56_rmw_stripe(rbio);
2202 }
2203
read_rebuild_work(struct btrfs_work * work)2204 static void read_rebuild_work(struct btrfs_work *work)
2205 {
2206 struct btrfs_raid_bio *rbio;
2207
2208 rbio = container_of(work, struct btrfs_raid_bio, work);
2209 __raid56_parity_recover(rbio);
2210 }
2211
/*
 * The following code is used to scrub/replace the parity stripe
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: We need make sure all the pages that add into the scrub/replace
 * raid bio are correct and not be changed during the scrub/replace. That
 * is those pages just hold metadata or file data with checksum.
 */

/*
 * Allocate an rbio configured for parity scrub.
 *
 * @bio:             zero-length placeholder bio; only carries the completion
 *                   handler so the scrub rbio looks like the other types
 * @bioc:            io context (caller already holds a bio_counter ref)
 * @stripe_len:      stripe length used to size the rbio
 * @scrub_dev:       the device whose parity stripe is being scrubbed
 * @dbitmap:         bitmap of sectors within the stripe to scrub
 * @stripe_nsectors: number of sectors per stripe (must equal stripe_npages
 *                   while sectorsize == PAGE_SIZE)
 *
 * Returns the new rbio, or NULL on allocation failure (the underlying
 * error code from alloc_rbio() is dropped).
 */
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				u64 stripe_len, struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bioc, stripe_len);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio is similar to the other types
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
	 * to the end position, so this search can start from the first parity
	 * stripe.
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bioc->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	/* scrub_dev must be one of the parity stripes' devices. */
	ASSERT(i < rbio->real_stripes);

	/* Now we just support the sectorsize equals to page size */
	ASSERT(fs_info->sectorsize == PAGE_SIZE);
	ASSERT(rbio->stripe_npages == stripe_nsectors);
	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);

	/*
	 * We have already increased bio_counter when getting bioc, record it
	 * so we can free it at rbio_orig_end_io().
	 */
	rbio->generic_bio_cnt = 1;

	return rbio;
}
2268
2269 /* Used for both parity scrub and missing. */
raid56_add_scrub_pages(struct btrfs_raid_bio * rbio,struct page * page,u64 logical)2270 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2271 u64 logical)
2272 {
2273 int stripe_offset;
2274 int index;
2275
2276 ASSERT(logical >= rbio->bioc->raid_map[0]);
2277 ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
2278 rbio->stripe_len * rbio->nr_data);
2279 stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
2280 index = stripe_offset >> PAGE_SHIFT;
2281 rbio->bio_pages[index] = page;
2282 }
2283
2284 /*
2285 * We just scrub the parity that we have correct data on the same horizontal,
2286 * so we needn't allocate all pages for all the stripes.
2287 */
alloc_rbio_essential_pages(struct btrfs_raid_bio * rbio)2288 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2289 {
2290 int i;
2291 int bit;
2292 int index;
2293 struct page *page;
2294
2295 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2296 for (i = 0; i < rbio->real_stripes; i++) {
2297 index = i * rbio->stripe_npages + bit;
2298 if (rbio->stripe_pages[index])
2299 continue;
2300
2301 page = alloc_page(GFP_NOFS);
2302 if (!page)
2303 return -ENOMEM;
2304 rbio->stripe_pages[index] = page;
2305 }
2306 }
2307 return 0;
2308 }
2309
/*
 * Recompute the parity for every sector set in rbio->dbitmap, compare it
 * against what is on disk, and write back only the parity pages that were
 * wrong (plus, for dev-replace, copies to the replacement target).
 *
 * @need_check: when zero, skip the recompute/compare phase and go straight
 *              to writeback (used after a reconstruction already fixed the
 *              data in place).
 *
 * Ends the rbio with BLK_STS_OK when nothing needed writing, or with
 * BLK_STS_IOERR on allocation/bio-build failure; otherwise the writes'
 * end_io handler (raid_write_end_io) finishes the rbio.
 */
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	bool has_qstripe;
	struct page *p_page = NULL;
	struct page *q_page = NULL;
	struct bio_list bio_list;
	struct bio *bio;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	/* One extra stripe means RAID5 (P only), two means RAID6 (P+Q). */
	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/*
	 * Dev-replace of the scrubbed device: remember the sectors to copy
	 * to the target before dbitmap gets bits cleared below.
	 */
	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
		is_replace = 1;
		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
	}

	/*
	 * Because the higher layers(scrubber) are unlikely to
	 * use this area of the disk again soon, so don't cache
	 * it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	if (!need_check)
		goto writeback;

	/* Scratch page to hold the freshly computed P parity. */
	p_page = alloc_page(GFP_NOFS);
	if (!p_page)
		goto cleanup;
	SetPageUptodate(p_page);

	if (has_qstripe) {
		/* RAID6, allocate and map temp space for the Q stripe */
		q_page = alloc_page(GFP_NOFS);
		if (!q_page) {
			__free_page(p_page);
			goto cleanup;
		}
		SetPageUptodate(q_page);
		pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
	}

	atomic_set(&rbio->error, 0);

	/* Map the parity stripe just once */
	pointers[nr_data] = kmap_local_page(p_page);

	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *p;
		void *parity;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap_local_page(p);
		}

		if (has_qstripe) {
			/* RAID6, call the library function to fill in our P/Q */
			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			copy_page(pointers[nr_data], pointers[0]);
			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
		}

		/* Check scrubbing parity and repair it */
		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		parity = kmap_local_page(p);
		if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
			copy_page(parity, pointers[rbio->scrubp]);
		else
			/* Parity is right, needn't writeback */
			bitmap_clear(rbio->dbitmap, pagenr, 1);
		kunmap_local(parity);

		/*
		 * kmap_local mappings must be released in reverse order of
		 * creation, hence the countdown loop.
		 */
		for (stripe = nr_data - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	/* Unmap P (and Q) in reverse order of mapping as well. */
	kunmap_local(pointers[nr_data]);
	__free_page(p_page);
	if (q_page) {
		kunmap_local(pointers[rbio->real_stripes - 1]);
		__free_page(q_page);
	}

writeback:
	/*
	 * time to start writing. Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q. Ignore
	 * everything else.
	 */
	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list,
			       page, rbio->scrubp, pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	/* Queue copies of all originally-dirty sectors to the target dev. */
	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list, page,
				       bioc->tgtdev_map[rbio->scrubp],
				       pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

submit_write:
	nr_data = bio_list_size(&bio_list);
	if (!nr_data) {
		/* Every parity is right */
		rbio_orig_end_io(rbio, BLK_STS_OK);
		return;
	}

	atomic_set(&rbio->stripes_pending, nr_data);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		bio->bi_opf = REQ_OP_WRITE;

		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}
2467
is_data_stripe(struct btrfs_raid_bio * rbio,int stripe)2468 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2469 {
2470 if (stripe >= 0 && stripe < rbio->nr_data)
2471 return 1;
2472 return 0;
2473 }
2474
/*
 * While we're doing the parity check and repair, we could have errors
 * in reading pages off the disk. This checks for errors and if we're
 * not able to read the page it'll trigger parity reconstruction. The
 * parity scrub will be finished after we've reconstructed the failed
 * stripes
 */
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
	/* More failures than the profile can tolerate: give up. */
	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
		goto cleanup;

	if (rbio->faila >= 0 || rbio->failb >= 0) {
		/*
		 * dfail counts failed data stripes; failp records the last
		 * failed parity stripe index (or -1 if none failed).
		 */
		int dfail = 0, failp = -1;

		if (is_data_stripe(rbio, rbio->faila))
			dfail++;
		else if (is_parity_stripe(rbio->faila))
			failp = rbio->faila;

		if (is_data_stripe(rbio, rbio->failb))
			dfail++;
		else if (is_parity_stripe(rbio->failb))
			failp = rbio->failb;

		/*
		 * Because we can not use a scrubbing parity to repair
		 * the data, so the capability of the repair is declined.
		 * (In the case of RAID5, we can not repair anything)
		 */
		if (dfail > rbio->bioc->max_errors - 1)
			goto cleanup;

		/*
		 * If all data is good, only parity is correctly, just
		 * repair the parity.
		 */
		if (dfail == 0) {
			/* need_check=0: data is intact, just rewrite parity */
			finish_parity_scrub(rbio, 0);
			return;
		}

		/*
		 * Here means we got one corrupted data stripe and one
		 * corrupted parity on RAID6, if the corrupted parity
		 * is scrubbing parity, luckily, use the other one to repair
		 * the data, or we can not repair the data stripe.
		 */
		if (failp != rbio->scrubp)
			goto cleanup;

		/* Reconstruct the failed stripes; scrub resumes afterwards. */
		__raid_recover_end_io(rbio);
	} else {
		/* No read failures: do the full check-and-repair pass. */
		finish_parity_scrub(rbio, 1);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
2535
2536 /*
2537 * end io for the read phase of the rmw cycle. All the bios here are physical
2538 * stripe bios we've read from the disk so we can recalculate the parity of the
2539 * stripe.
2540 *
2541 * This will usually kick off finish_rmw once all the bios are read in, but it
2542 * may trigger parity reconstruction if we had any errors along the way
2543 */
raid56_parity_scrub_end_io(struct bio * bio)2544 static void raid56_parity_scrub_end_io(struct bio *bio)
2545 {
2546 struct btrfs_raid_bio *rbio = bio->bi_private;
2547
2548 if (bio->bi_status)
2549 fail_bio_stripe(rbio, bio);
2550 else
2551 set_bio_pages_uptodate(bio);
2552
2553 bio_put(bio);
2554
2555 if (!atomic_dec_and_test(&rbio->stripes_pending))
2556 return;
2557
2558 /*
2559 * this will normally call finish_rmw to start our write
2560 * but if there are any failed stripes we'll reconstruct
2561 * from parity first
2562 */
2563 validate_rbio_for_parity_scrub(rbio);
2564 }
2565
/*
 * Read phase of the parity scrub: allocate the pages we need, build read
 * bios for every page in the scrubbed sectors that we do not already have,
 * and submit them. The actual check/repair/writeback happens from the
 * read completion path (raid56_parity_scrub_end_io ->
 * validate_rbio_for_parity_scrub). If nothing needs reading, validation
 * runs inline. Errors end the rbio with BLK_STS_IOERR.
 */
static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	/* Only pages for sectors set in dbitmap are needed. */
	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
			struct page *page;
			/*
			 * we want to find all the pages missing from
			 * the rbio and read them from the disk. If
			 * page_in_rbio finds a page in the bio list
			 * we don't need to read it off the stripe.
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page. If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * The bioc may be freed once we submit the last bio. Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid56_parity_scrub_end_io;
		bio->bi_opf = REQ_OP_READ;

		btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);

		submit_bio(bio);
	}
	/* the actual write will happen once the reads are done */
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return;

finish:
	validate_rbio_for_parity_scrub(rbio);
}
2653
scrub_parity_work(struct btrfs_work * work)2654 static void scrub_parity_work(struct btrfs_work *work)
2655 {
2656 struct btrfs_raid_bio *rbio;
2657
2658 rbio = container_of(work, struct btrfs_raid_bio, work);
2659 raid56_parity_scrub_stripe(rbio);
2660 }
2661
raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio * rbio)2662 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2663 {
2664 if (!lock_stripe_add(rbio))
2665 start_async_work(rbio, scrub_parity_work);
2666 }
2667
/* The following code is used for dev replace of a missing RAID 5/6 device. */

/*
 * Allocate an rbio that rebuilds the stripe of a missing device.
 *
 * @bio:    zero-length placeholder bio holding the completion handler
 * @bioc:   io context (caller already holds a bio_counter ref, recorded
 *          below via generic_bio_cnt)
 * @length: stripe length used to size the rbio
 *
 * Returns the new rbio, or NULL on failure. The error code from
 * alloc_rbio() is dropped.
 */
struct btrfs_raid_bio *
raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
			  u64 length)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;

	rbio = alloc_rbio(fs_info, bioc, length);
	if (IS_ERR(rbio))
		return NULL;

	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio is similar to the other types
	 */
	ASSERT(!bio->bi_iter.bi_size);

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		/*
		 * The caller guarantees the bio maps into this full stripe,
		 * so a miss here is a logic error. BUG() normally halts;
		 * the kfree/return below only run in configurations where
		 * BUG() does not stop execution.
		 */
		BUG();
		kfree(rbio);
		return NULL;
	}

	/*
	 * When we get bioc, we have already increased bio_counter, record it
	 * so we can free it at rbio_orig_end_io()
	 */
	rbio->generic_bio_cnt = 1;

	return rbio;
}
2704
raid56_submit_missing_rbio(struct btrfs_raid_bio * rbio)2705 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2706 {
2707 if (!lock_stripe_add(rbio))
2708 start_async_work(rbio, read_rebuild_work);
2709 }
2710