diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 45cb5864a22b..8b992e95dbb6 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -118,6 +118,7 @@ typedef struct raidz_col { uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */ uint8_t rc_force_repair:1; /* Write good data to this column */ uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */ + uint8_t rc_latency_outlier:1; /* Latency outlier for this device */ int rc_shadow_devidx; /* for double write during expansion */ int rc_shadow_error; /* for double write during expansion */ uint64_t rc_shadow_offset; /* for double write during expansion */ @@ -132,6 +133,7 @@ typedef struct raidz_row { int rr_firstdatacol; /* First data column/parity count */ abd_t *rr_abd_empty; /* dRAID empty sector buffer */ int rr_nempty; /* empty sectors included in parity */ + int rr_noutliers; /* Count of latency outlier devices */ #ifdef ZFS_DEBUG uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index aae8acced89a..20b102f697d2 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1889,17 +1889,6 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) /* Sequential rebuild must do IO at redundancy group boundary. */ IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0); - /* - * Calculate how much parity is available for sitting out reads - */ - int parity_avail = rr->rr_firstdatacol; - for (int p = 0; p < rr->rr_firstdatacol; p++) { - raidz_col_t *rc = &rr->rr_col[p]; - if (!vdev_draid_readable(vd->vdev_child[rc->rc_devidx], - rc->rc_offset)) { - parity_avail--; - } - } /* * Iterate over the columns in reverse order so that we hit the parity * last. Any errors along the way will force us to read the parity. @@ -2004,14 +1993,29 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_force_repair = 1; rc->rc_allow_repair = 1; } - } else if (parity_avail > 0 && c >= rr->rr_firstdatacol && - rr->rr_missingdata == 0 && - vdev_skip_latency_outlier(cvd, zio->io_flags)) { - rr->rr_missingdata++; - rc->rc_error = SET_ERROR(EAGAIN); - rc->rc_skipped = 1; - parity_avail--; - continue; + } else if (vdev_skip_latency_outlier(cvd, zio->io_flags)) { + rr->rr_noutliers++; + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_noutliers > 0 && rr->rr_missingdata == 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) > 0) { + + for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_latency_outlier) { + rr->rr_missingdata++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } } } diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index b905108e575c..bc6d378f1771 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2482,18 +2482,6 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) { vdev_t *vd = zio->io_vd; - /* - * Calculate how much parity is available for sitting out reads - */ - int parity_avail = rr->rr_firstdatacol; - for (int p = 0; p < rr->rr_firstdatacol; p++) { - raidz_col_t *rc = &rr->rr_col[p]; - if (rc->rc_size > 0 && - !vdev_readable(vd->vdev_child[rc->rc_devidx])) { - parity_avail--; - } - } - /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. @@ -2513,19 +2501,6 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) rc->rc_skipped = 1; continue; } - /* - * Check if a data colummn read should be skipped - */ - if (parity_avail > 0 && - c >= rr->rr_firstdatacol && - rr->rr_missingdata == 0 && - vdev_skip_latency_outlier(cvd, zio->io_flags)) { - rr->rr_missingdata++; - rc->rc_error = SET_ERROR(EAGAIN); - rc->rc_skipped = 1; - parity_avail--; - continue; - } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; @@ -2535,6 +2510,40 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) rc->rc_skipped = 1; continue; } + + if (vdev_skip_latency_outlier(cvd, zio->io_flags)) { + rr->rr_noutliers++; + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_noutliers > 0 && rr->rr_missingdata == 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) > 0) { + + for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_latency_outlier) { + rr->rr_missingdata++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } + } + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error || rc->rc_size == 0) + continue; + if (forceparity || c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {