#! /bin/bash
# SPDX-License-Identifier: GPL-2.0-or-later
# Copyright (c) 2022 Oracle.  All Rights Reserved.
#
# FS QA Test No. 556
#
# Check xfs_scrub's media scan can actually return diagnostic information for
# media errors in file data extents.

. ./common/preamble
_begin_fstest auto quick scrub eio

# Override the default cleanup function.
# Test-specific cleanup hook, replacing the default one.
#
# Globals:   tmp (read) - path prefix of this test's temporary files
# Outputs:   none of its own
#
# The steps are best-effort on purpose: a failure in one must not stop the
# remaining teardown from running.
_cleanup()
{
	# Leave whatever directory the test was in before tearing things down.
	cd /
	# Quote the prefix so a path containing whitespace or glob characters is
	# not word-split; only the trailing ".*" is meant to be a glob.  "--"
	# guards against a prefix that begins with a dash.
	rm -f -- "$tmp".*
	# Undo the dm-error setup done by _dmerror_init (helper from
	# common/dmerror).
	_dmerror_cleanup
}

# Import common functions.
. ./common/fuzzy
. ./common/filter
. ./common/dmerror

# real QA test starts here
# Prerequisite checks.  Each helper comes from the sourced common/ files;
# presumably each one skips the test when its requirement is not met (confirm
# against the helper definitions).  Keep the order: it decides which reason is
# reported first.
# This test only applies to XFS.
_supported_fs xfs
# The device-mapper "error" target is used to inject the media error.
_require_dm_target error
# A scratch device is formatted and mounted below.
_require_scratch
# NOTE(review): name suggests a CRC-enabled (v5) scratch XFS is needed - confirm.
_require_scratch_xfs_crc
# xfs_scrub is the tool under test.
_require_scrub

# Normalize scrub error messages so they do not depend on the fs block size.
#
# Globals:   fs_blksz (read at call time) - filesystem block size in bytes
# Inputs:    raw scrub stderr on stdin
# Outputs:   stdin passed through _filter_scratch, then with
#            "offset <2 * fs_blksz> " rewritten to "offset 2FSB " and
#            "length <fs_blksz>" plus the rest of that line rewritten to
#            "length 1FSB."
filter_scrub_errors() {
	local bad_offset=$((fs_blksz * 2))
	local offset_rule="s/offset ${bad_offset} /offset 2FSB /g"
	local length_rule="s/length ${fs_blksz}.*/length 1FSB./g"

	_filter_scratch | sed -e "$offset_rule" -e "$length_rule"
}

# Format the scratch device, set up dm-error on top of it, and mount the
# filesystem through dm-error.  Tool chatter goes to the full log only.
_scratch_mkfs >> $seqres.full
_dmerror_init
_dmerror_mount >> $seqres.full 2>&1

# Skip the test if scrub is not supported on the mounted filesystem.
if ! _supports_xfs_scrub $SCRATCH_MNT $SCRATCH_DEV; then
	_notrun "Scrub not supported"
fi

# Create the victim file: four file blocks of 0x58 bytes, fsync'd to disk.
victim=$SCRATCH_MNT/a
file_blksz=$(_get_file_block_size $SCRATCH_MNT)
write_len=$((4 * file_blksz))
$XFS_IO_PROG -f -c "pwrite -S 0x58 0 $write_len" -c "fsync" $victim >> $seqres.full

# errordev stays unset unless the victim is a realtime file, in which case it
# is "RT".  It is handed to the dmerror range helpers later on.
unset errordev
if _xfs_is_realtime_file $victim; then
	errordev="RT"
fi

# Grab the bmap line for extent 0 of the victim.
bmap_str="$($XFS_IO_PROG -c "bmap -elpv" $victim | grep "^[[:space:]]*0:")"
echo "$errordev:$bmap_str" >> $seqres.full

# The extent length is read from column 4 for a realtime file and from
# column 6 otherwise; the physical range is column 3 in both cases.
if [ "$errordev" = "RT" ]; then
	len_col=4
else
	len_col=6
fi
phys="$(echo "$bmap_str" | $AWK_PROG '{print $3}')"
len="$(echo "$bmap_str" | $AWK_PROG -v col="$len_col" '{print $col}')"

fs_blksz=$(_get_block_size $SCRATCH_MNT)
echo "file_blksz:$file_blksz:fs_blksz:$fs_blksz" >> $seqres.full
kernel_sectors_per_fs_block=$((fs_blksz / 512))

# Did we get at least 4 fs blocks worth of extent?  $len is compared against
# a count of 512-byte sectors.
min_len_sectors=$(( 4 * kernel_sectors_per_fs_block ))
if test "$len" -lt $min_len_sectors; then
	_fail "could not format a long enough extent on an empty fs??"
fi

# Keep only the start of the "start..end" physical range.
phys_start=${phys%%..*}

echo "$errordev:$phys:$len:$fs_blksz:$phys_start" >> $seqres.full
echo "victim file:" >> $seqres.full
od -tx1 -Ad -c $victim >> $seqres.full

# Start from a dmerror table that lets all IO pass through.
_dmerror_reset_table

# Record the tables in the full log before modifying them.
printf '%s\n' \
	"dmerror before:" \
	"$DMERROR_TABLE" \
	"$DMERROR_RTTABLE" \
	"<end table>" >> $seqres.full

# Sector numbers handed to the kernel are in units of 512 bytes, but they must
# also be aligned to the device's logical block size.
logical_block_size=$(_min_dio_alignment $SCRATCH_DEV)
kernel_sectors_per_device_lba=$((logical_block_size / 512))

# Pick one device LBA in the middle of the extent to mark bad.  The base is
# the start of the third fs block of the four-block extent.  When a device
# LBA is smaller than an fs block, step forward one LBA so that the second
# LBA of that block is hit; the bad range never reaches the fourth block.
lba_skip=0
if (( kernel_sectors_per_device_lba < kernel_sectors_per_fs_block )); then
	lba_skip=$kernel_sectors_per_device_lba
fi
bad_len=$kernel_sectors_per_device_lba
bad_sector=$(( phys_start + (2 * kernel_sectors_per_fs_block) + lba_skip ))

# Complain on stdout if the chosen sector is not aligned to the device LBA.
if (( bad_sector % kernel_sectors_per_device_lba )); then
	echo "bad_sector $bad_sector not congruent with device logical block size $logical_block_size"
fi
_dmerror_mark_range_bad $bad_sector $bad_len $errordev

# Record the modified tables in the full log.
printf '%s\n' \
	"dmerror after marking bad:" \
	"$DMERROR_TABLE" \
	"$DMERROR_RTTABLE" \
	"<end table>" >> $seqres.full

_dmerror_load_error_table

# See if the media scan picks it up.  For every scrub run, stdout goes to the
# full log and stderr is captured in $tmp.error, then printed through
# filter_scrub_errors.
echo "Scrub for injected media error (single threaded)"

# Once in single-threaded mode
_scratch_scrub -b -x >> $seqres.full 2> $tmp.error
filter_scrub_errors < $tmp.error

# Once in parallel mode
echo "Scrub for injected media error (multi threaded)"
_scratch_scrub -x >> $seqres.full 2> $tmp.error
filter_scrub_errors < $tmp.error

# Remount to flush the page cache, then reread the file to see the IO error.
# The "read error: " prefix is stripped from od's stderr before it is printed.
_dmerror_unmount
_dmerror_mount
echo "victim file:" >> $seqres.full
od -tx1 -Ad -c $victim >> $seqres.full 2> $tmp.error
sed -e 's/read error: //g' < $tmp.error | _filter_scratch

# Scrub again to re-confirm the media error across a remount
echo "Scrub for injected media error (after remount)"
_scratch_scrub -x >> $seqres.full 2> $tmp.error
filter_scrub_errors < $tmp.error

# Now mark the bad range good so that a retest shows no media failure.
_dmerror_mark_range_good $bad_sector $bad_len $errordev
_dmerror_load_error_table

# Record the restored tables in the full log.
printf '%s\n' \
	"dmerror after marking good:" \
	"$DMERROR_TABLE" \
	"$DMERROR_RTTABLE" \
	"<end table>" >> $seqres.full

echo "Scrub after removing injected media error"

# Scrub one last time to make sure the error's gone.
_scratch_scrub -x >> $seqres.full 2> $tmp.error
filter_scrub_errors < $tmp.error

# success, all done
status=0
exit
