Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SHiP: tests for restarting from corrupted log and index files #983

Merged
merged 6 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/full-version-label.sh ${CMAKE_CURRENT
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/nodeos_producer_watermark_test.py ${CMAKE_CURRENT_BINARY_DIR}/nodeos_producer_watermark_test.py COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cli_test.py ${CMAKE_CURRENT_BINARY_DIR}/cli_test.py COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_reqs_across_svnn_test.py ${CMAKE_CURRENT_BINARY_DIR}/ship_reqs_across_svnn_test.py COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_restart_test.py ${CMAKE_CURRENT_BINARY_DIR}/ship_restart_test.py COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_test.py ${CMAKE_CURRENT_BINARY_DIR}/ship_test.py COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_streamer_test.py ${CMAKE_CURRENT_BINARY_DIR}/ship_streamer_test.py COPYONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_kill_client_test.py ${CMAKE_CURRENT_BINARY_DIR}/ship_kill_client_test.py COPYONLY)
Expand Down Expand Up @@ -176,6 +177,8 @@ set_property(TEST production_restart PROPERTY LABELS nonparallelizable_tests)

add_test(NAME ship_reqs_across_svnn_test COMMAND tests/ship_reqs_across_svnn_test.py -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
set_property(TEST ship_reqs_across_svnn_test PROPERTY LABELS nonparallelizable_tests)
add_test(NAME ship_restart_test COMMAND tests/ship_restart_test.py -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
set_property(TEST ship_restart_test PROPERTY LABELS nonparallelizable_tests)
add_test(NAME ship_test COMMAND tests/ship_test.py -v --num-clients 10 --num-requests 5000 ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
set_property(TEST ship_test PROPERTY LABELS nonparallelizable_tests)
add_test(NAME ship_test_unix COMMAND tests/ship_test.py -v --num-clients 10 --num-requests 5000 ${UNSHARE} --unix-socket WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
Expand Down
6 changes: 3 additions & 3 deletions tests/TestHarness/testUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,9 +514,9 @@ def compare(obj1,obj2,context):
return "comparison of %s type is not supported, context=%s" % (typeName,context)

@staticmethod
def compareFiles(file1: str, file2: str):
f1 = open(file1)
f2 = open(file2)
def compareFiles(file1: str, file2: str, mode="r"):
f1 = open(file1, mode)
f2 = open(file2, mode)

i = 0
same = True
Expand Down
285 changes: 285 additions & 0 deletions tests/ship_restart_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
#!/usr/bin/env python3

import os
import tempfile
import shutil
import signal

from TestHarness import Cluster, TestHelper, Utils, WalletMgr

###############################################################################
# ship_restart_test
#
# This test verifies SHiP shuts down gracefully or recovers when restarting
# with various scenarios of corrupted log and/or index files.
#
###############################################################################

# Shorthand for the harness logger.
Print = Utils.Print

# Command-line options accepted by this test.
args = TestHelper.parse_args({
    "--dump-error-details",
    "--keep-logs",
    "-v",
    "--leave-running",
    "--unshared",
})

Utils.Debug = args.v
cluster = Cluster(
    unshared=args.unshared,
    keepRunning=args.leave_running,
    keepLogs=args.keep_logs,
)
dumpErrorDetails = args.dump_error_details
walletPort = TestHelper.DEFAULT_WALLET_PORT

# Topology: one producer plus one non-producing node that runs SHiP.
totalProducerNodes = 1
totalNonProducerNodes = 1  # for SHiP node
totalNodes = totalProducerNodes + totalNonProducerNodes

walletMgr = WalletMgr(True, port=walletPort)
testSuccessful = False  # set to True only at the very end of the test

# Node ids used with cluster.getNode() and the per-node argument map.
prodNodeId = 0
shipNodeId = 1

# Paths of the live SHiP log/index files and of their pristine backups.
# They start out empty and receive real paths once the cluster has been
# launched and stopped; corruptedHeaderTest() reads them as module globals.
origStateHistoryLog = ""
stateHistoryLog = ""
origStateHistoryIndex = ""
stateHistoryIndex = ""

# Temporary directory holding the backups. It is created later inside the
# `try` block, but the `finally` clause tests `tmpDir is not None`; defining
# it here keeps that cleanup from raising NameError (and hiding the original
# error) when the test fails before the directory exists.
tmpDir = None

def corruptedHeaderTest(pos, corruptedValue, shipNode):
    """Verify that SHiP fails to restart with a corrupted first entry header.

    The saved log and index files are copied back over the live ones, the
    bytes ``corruptedValue`` are written into the live log at offset ``pos``,
    and ``shipNode`` is relaunched. The relaunch is expected to fail.
    """
    # Start from pristine copies of the log and the index.
    for saved, live in ((origStateHistoryLog, stateHistoryLog),
                        (origStateHistoryIndex, stateHistoryIndex)):
        shutil.copyfile(saved, live)

    # Overwrite the target bytes in place (binary read/write mode).
    with open(stateHistoryLog, 'rb+') as logFile:
        logFile.seek(pos)
        logFile.write(corruptedValue)

    relaunched = shipNode.relaunch()
    assert not relaunched, "SHiP node should have failed to relaunch"

# Main test body; the matching `finally` clause performs shutdown and cleanup.
try:
TestHelper.printSystemInfo("BEGIN")

cluster.setWalletMgr(walletMgr)
Print("Stand up cluster")

# Extra nodeos command-line arguments registered under the SHiP node's id:
# they load the state_history plugin with trace, chain-state and finality-data
# history enabled and a stride of 200, plus the net_api and producer_api plugins.
specificExtraNodeosArgs={}
specificExtraNodeosArgs[shipNodeId]="--plugin eosio::state_history_plugin --trace-history --chain-state-history --finality-data-history --state-history-stride 200 --plugin eosio::net_api_plugin --plugin eosio::producer_api_plugin"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the chain ever make it past block 200 to exercise anything other than the simple "flat log" case? But I suppose there is probably not much that can be done for a corrupted non-head split log anyways -- fixing it just makes a hole that isn't allowed.

I do wonder if we should do some additional corruption tests with pruned logs. But those are even less repairable.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the chain ever make it past block 200 to exercise anything other than the simple "flat log" case? But I suppose there is probably not much that can be done for a corrupted non-head split log anyways -- fixing it just makes a hole that isn't allowed.

I do wonder if we should do some additional corruption tests with pruned logs. But those are even less repairable.

Thanks. I created #1011 to track additional tests for corrupted pruned and split logs. I think they should be done as part of the bigger initiative to revisit how corrupted logs are handled (repairing at startup or offline via spring-util).

I am merging the PR in its current form so we have some basic tests.


# Launch the cluster in a mesh topology; abort the test if the launch reports failure.
if cluster.launch(topo="mesh", pnodes=totalProducerNodes, totalNodes=totalNodes,
activateIF=True,
specificExtraNodeosArgs=specificExtraNodeosArgs) is False:
Utils.cmdError("launcher")
Utils.errorExit("Failed to stand up cluster.")

# Verify nodes are in sync and advancing
cluster.waitOnClusterSync(blockAdvancing=5)
Print("Cluster in Sync")

Print("Shutdown unneeded bios node")
cluster.biosNode.kill(signal.SIGTERM)

prodNode = cluster.getNode(prodNodeId)
shipNode = cluster.getNode(shipNodeId)

# Stop both nodes with SIGTERM before touching the SHiP files on disk.
Print("Shutdown producer and SHiP nodes")
prodNode.kill(signal.SIGTERM)
shipNode.kill(signal.SIGTERM)

# Live files sit in the SHiP node's "state-history" directory; the backups go
# into a freshly created temporary directory under the same file names.
shipDir = os.path.join(Utils.getNodeDataDir(shipNodeId), "state-history")
stateHistoryLog = os.path.join(shipDir, "chain_state_history.log")
stateHistoryIndex = os.path.join(shipDir, "chain_state_history.index")
tmpDir = tempfile.mkdtemp()
origStateHistoryLog = os.path.join(tmpDir, "chain_state_history.log")
origStateHistoryIndex = os.path.join(tmpDir, "chain_state_history.index")

# save original chain_state_history log and index files
# (every later scenario restores from, or compares against, these copies)
Print("Save original SHiP log and index")
shutil.copyfile(stateHistoryLog, origStateHistoryLog)
shutil.copyfile(stateHistoryIndex, origStateHistoryIndex)

############## Part 1: tests while producer node is down #################

#-------- Index file is removed. It should be regenerated at restart.
Print("index file removed test")

# delete the live index; the log file is left as it was at shutdown
os.remove(stateHistoryIndex)

isRelaunchSuccess = shipNode.relaunch()
assert isRelaunchSuccess, "Failed to relaunch shipNode"
# both files are compared in binary mode against the saved originals
assert Utils.compareFiles(stateHistoryLog, origStateHistoryLog, mode="rb") # log unchanged
assert Utils.compareFiles(stateHistoryIndex, origStateHistoryIndex, mode="rb") # index regenerated

shipNode.kill(signal.SIGTERM) # shut down ship node for next test

'''
Test failure 1: index file was not regenerated. Reenable this after https://github.com/AntelopeIO/spring/issues/990 is fixed.

#-------- Index file last entry is corrupted. It should be regenerated at restart.
with open(stateHistoryIndex, 'rb+') as stateHistoryIndexFile: # opened as binary file
# seek to last entry (8 bytes before the end of file)
stateHistoryIndexFile.seek(-8, 2) # -8 for backward, 2 for starting at end

# set the index to a random value
stateHistoryIndexFile.write(b'\x00\x01\x02\x03\x04\x05\x06\x07')

isRelaunchSuccess = shipNode.relaunch()
assert isRelaunchSuccess, "Failed to relaunch shipNode"
assert Utils.compareFiles(stateHistoryLog, origStateHistoryLog, mode="rb")
assert Utils.compareFiles(stateHistoryIndex, origStateHistoryIndex, mode="rb")
'''

#-------- Truncate index file. It should be regenerated
# because index size is not the same as expected size
Print("Truncated index file test")

# restore log and index
shutil.copyfile(origStateHistoryLog, stateHistoryLog)
shutil.copyfile(origStateHistoryIndex, stateHistoryIndex)

# shorten the live index by its final 8 bytes
with open(stateHistoryIndex, 'rb+') as f:
indexFileSize = os.path.getsize(stateHistoryIndex)
newSize = indexFileSize - 8 # truncate 8 bytes
f.truncate(newSize)

isRelaunchSuccess = shipNode.relaunch()
assert isRelaunchSuccess, "Failed to relaunch shipNode"
# after the restart both files must be byte-identical to the saved originals
assert Utils.compareFiles(stateHistoryLog, origStateHistoryLog, mode="rb") # log file unchanged
assert Utils.compareFiles(stateHistoryIndex, origStateHistoryIndex, mode="rb") # index file regenerated

shipNode.kill(signal.SIGTERM) # shut down it for next test

#-------- Add an extra entry to index file. It should be regenerated
# because index size is not the same as expected size
Print("Extra entry in index file test")

# restore log and index
shutil.copyfile(origStateHistoryLog, stateHistoryLog)
shutil.copyfile(origStateHistoryIndex, stateHistoryIndex)

# append 8 extra bytes after the current end of the live index
with open(stateHistoryIndex, 'rb+') as stateHistoryIndexFile: # opened as binary file
stateHistoryIndexFile.seek(0, 2) # seek to end of file
stateHistoryIndexFile.write(b'\x00\x00\x00\x00\x00\x00\x01\x0F') # write a small value

isRelaunchSuccess = shipNode.relaunch()
assert isRelaunchSuccess, "Failed to relaunch shipNode"
# after the restart both files must be byte-identical to the saved originals
assert Utils.compareFiles(stateHistoryLog, origStateHistoryLog, mode="rb") # log file not changed
assert Utils.compareFiles(stateHistoryIndex, origStateHistoryIndex, mode="rb") # index file regenerated

shipNode.kill(signal.SIGTERM) # shut down it for next test

#-------- Remove log file. The log file should be reconstructed from state
# and restart succeeds
Print("Removed log file test")

# only the index is restored here; the live log is deleted on the next step
shutil.copyfile(origStateHistoryIndex, stateHistoryIndex)

os.remove(stateHistoryLog)

# this scenario checks only that the relaunch succeeds; file contents are not compared
isRelaunchSuccess = shipNode.relaunch()
assert isRelaunchSuccess, "Failed to relaunch shipNode"

shipNode.kill(signal.SIGTERM) # shut down it for next test

# Each call below restores the saved log and index, overwrites 8 bytes of the
# log at the given offset, and asserts that relaunching the SHiP node fails.

#-------- Corrupt first entry's magic. Relaunch should fail
Print("first entry magic corruption test")
corruptedHeaderTest(0, b'\x00\x01\x02\x03\x04\x05\x06\x07', shipNode) # 0 is magic's position

#-------- Corrupt first entry's block_id. Relaunch should fail
Print("first entry block_id corruption test")
corruptedHeaderTest(8, b'\x00\x01\x02\x03\x04\x05\x06\x07', shipNode) # 8 is block_id's position

'''
# Test failure 2: Reenable this after https://github.com/AntelopeIO/spring/issues/989 is fixed.
#-------- Corrupt last entry's position . It should be repaired.
# After producer node restarts, head on SHiP node should advance.
Print("last entry postion corruption test")

shutil.copyfile(origStateHistoryLog, stateHistoryLog)
shutil.copyfile(origStateHistoryIndex, stateHistoryIndex)

with open(stateHistoryLog, 'rb+') as stateHistoryLogFile: # opened as binary file
# seek to last index (8 bytes before the end of file)
stateHistoryLogFile.seek(-8, 2) # -8 for backward, 2 for starting at end

# set the index to a random value
stateHistoryLogFile.write(b'\x00\x01\x02\x03\x04\x05\x06\x07')

isRelaunchSuccess = shipNode.relaunch()
assert isRelaunchSuccess, "Failed to relaunch shipNode"
isRelaunchSuccess = prodNode.relaunch(chainArg="--enable-stale-production")
assert isRelaunchSuccess, "Failed to relaunch prodNode"

assert shipNode.waitForHeadToAdvance(), "Head did not advance on shipNode"
prodNode.kill(signal.SIGTERM)
shipNode.kill(signal.SIGTERM)
'''

'''
# Test failure 3: Reenable this after https://github.com/AntelopeIO/spring/issues/989 is fixed.
#-------- Corrupt last entry's header. It should be repaired.
# After producer node restarts, head on SHiP node should advance.
Print("last entry header corruption test")

shutil.copyfile(origStateHistoryLog, stateHistoryLog)
shutil.copyfile(origStateHistoryIndex, stateHistoryIndex)

with open(stateHistoryLog, 'rb+') as f: # opened as binary file
# seek to last index (8 bytes before the end of file)
f.seek(-8, 2) # -8 for backward, 2 for starting at end

data = f.read(8)
integer_value = int.from_bytes(data, byteorder='little')
f.seek(integer_value)

# corrupt the header
f.write(b'\x00\x01\x02\x03\x04\x05\x06\x07')

isRelaunchSuccess = shipNode.relaunch()
assert isRelaunchSuccess, "Failed to relaunch shipNode"
isRelaunchSuccess = prodNode.relaunch(chainArg="--enable-stale-production")
assert isRelaunchSuccess, "Failed to relaunch prodNode"

assert shipNode.waitForHeadToAdvance(), "Head did not advance on shipNode"
prodNode.kill(signal.SIGTERM)
shipNode.kill(signal.SIGTERM)
'''

############## Part 2: tests while producer node is up #################

# Bring the producer back first.
# NOTE(review): --enable-stale-production presumably lets the single producer
# resume producing after having been stopped -- confirm against nodeos docs.
isRelaunchSuccess = prodNode.relaunch(chainArg="--enable-stale-production")
assert isRelaunchSuccess, "Failed to relaunch prodNode"

# restore pristine log and index before the next scenario
shutil.copyfile(origStateHistoryLog, stateHistoryLog)
shutil.copyfile(origStateHistoryIndex, stateHistoryIndex)

#-------- Index file is removed. It should be regenerated at restart
Print("Index file removed while producer node is up test")

os.remove(stateHistoryIndex)

isRelaunchSuccess = shipNode.relaunch()
assert isRelaunchSuccess, "Failed to relaunch shipNode"
# with the producer running, success is judged by the SHiP node's head advancing
assert shipNode.waitForHeadToAdvance(), "Head did not advance on shipNode"

shipNode.kill(signal.SIGTERM) # shut down it for next test

'''
# Test failure 4: Reenable this after issue https://github.com/AntelopeIO/spring/issues/989 fixed.
#-------- Corrupt last entry of log file. It should be repaired
# and head should advance
with open(stateHistoryLog, 'rb+') as stateHistoryLogFile: # opened as binary file
# seek to last index, 8 bytes before the end of file
stateHistoryLogFile.seek(-8, 2) # -8 for backward, 2 for starting at end

# set the index to a random value
stateHistoryLogFile.write(b'\x00\x01\x02\x03\x04\x05\x06\x07')

isRelaunchSuccess = shipNode.relaunch()
assert isRelaunchSuccess, "Failed to relaunch shipNode"
assert shipNode.waitForHeadToAdvance(), "Head did not advance on shipNode"
'''

# reached only when no exception or assertion failure interrupted the steps above
testSuccessful = True
finally:
# always runs: hand shutdown to TestHelper, then delete the temporary backup directory
TestHelper.shutdown(cluster, walletMgr, testSuccessful=testSuccessful, dumpErrorDetails=dumpErrorDetails)
if tmpDir is not None:
shutil.rmtree(tmpDir, ignore_errors=True)

# process exit status: 0 on success, 1 on failure
errorCode = 0 if testSuccessful else 1
exit(errorCode)