tools: Fix idf_monitor test case occasional hangup or failure

Closes idf/esp-idf#39
This commit is contained in:
Roland Dobai 2018-11-06 11:36:46 +01:00
parent 64d038c0d5
commit 608fc23278
3 changed files with 128 additions and 77 deletions

View file

@ -547,7 +547,7 @@ test_idf_monitor:
expire_in: 1 week
script:
- cd ${IDF_PATH}/tools/test_idf_monitor
- ${IDF_PATH}/tools/ci/multirun_with_pyenv.sh ./run_test_idf_monitor.py
- ./run_test_idf_monitor.py
test_idf_size:
<<: *host_test_template

View file

@ -373,7 +373,7 @@ class Monitor(object):
else:
raise RuntimeError("Bad event data %r" % ((event_tag,data),))
except SerialStopException:
pass
sys.stderr.write(ANSI_NORMAL + "Stopping condition has been received\n")
finally:
try:
self.console_reader.stop()

View file

@ -19,13 +19,16 @@ from __future__ import unicode_literals
from builtins import object
from io import open
import os
import signal
import sys
import time
import subprocess
import socket
import pty
import filecmp
import threading
test_list = (
# Add new tests here. All files should be placed in in_dir. Columns are:
# Add new tests here. All files should be placed in IN_DIR. Columns are:
# Input file Filter string File with expected output
('in1.txt', '', 'in1f1.txt'),
('in1.txt', '*:V', 'in1f1.txt'),
@ -35,89 +38,137 @@ test_list = (
('in2.txt', 'vfs', 'in2f2.txt'),
)
in_dir = 'tests/' # tests are in this directory
out_dir = 'outputs/' # test results are written to this directory (kept only for debugging purposes)
socat_in = './socatfile'# temporary socat file (deleted after run)
err_out = out_dir + 'monitor_error_output'
elf_file = './dummy.elf' # ELF file used for starting the monitor
idf_monitor = '{}/tools/idf_monitor.py'.format(os.getenv("IDF_PATH"))
IN_DIR = 'tests/' # tests are in this directory
OUT_DIR = 'outputs/' # test results are written to this directory (kept only for debugging purposes)
ERR_OUT = OUT_DIR + 'monitor_error_output'
ELF_FILE = './dummy.elf' # ELF file used for starting the monitor
IDF_MONITOR = '{}/tools/idf_monitor.py'.format(os.getenv("IDF_PATH"))
class SocatRunner(object):
"""
Runs socat in the background for creating a socket.
"""
# connection related to communicating with idf_monitor through sockets
HOST = 'localhost'
# blocking socket operations are used with timeout:
SOCKET_TIMEOUT = 30
# idf_monitor is killed if it doesn't respond:
MONITOR_TIMEOUT = 60
# the test is restarted after failure (idf_monitor has to be killed):
RETRIES_PER_TEST = 5
def monitor_timeout(process):
if process.poll() is None:
# idf_monitor is still running
try:
process.kill()
print('\tidf_monitor was killed because it did not finish in {} seconds.'.format(MONITOR_TIMEOUT))
except OSError as e:
if e.errno == errno.ESRCH:
# ignores a possible race condition which can occur when the process exits between poll() and kill()
pass
else:
raise
class TestRunner(object):
def __enter__(self):
# Wait for a connection on port 2399 and then run "tail" which will send the file content to that port. Tail
# is used because it can start even when the file doesn't exists and remains running after the file has been
# processed. This way the idf_monitor can end the communication when it received the content. Using regular
# "cat" would invoke exception in idf_monitor.
# Note: "-c 1GB" option is used to force sending the whole file under the assumption that all testing files
# will be much smaller than 1G.
# Note: A temporary file socat_in is used in order to be able to start socat only once instead of for each test.
socat_cmd = ['socat',
'-U', # unidirectional pipe from file to port
'TCP4-LISTEN:2399,reuseaddr,fork',
'exec:"tail -c 1GB -F ' + socat_in + '"']
print(' '.join(socat_cmd))
self._socat_process = subprocess.Popen(socat_cmd, preexec_fn=os.setsid) # See __exit__ for os.setsid
self.serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.serversocket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
self.serversocket.bind((HOST, 0))
self.port = self.serversocket.getsockname()[1]
self.serversocket.listen(5)
return self
def __exit__(self, type, value, traceback):
# self._socat_process.terminate() doesn't enough because each connection to the port starts a new socat and a
# tail processes
os.killpg(os.getpgid(self._socat_process.pid), signal.SIGTERM)
# Note: this terminates all the processes but makes the script UNIX-only
self.serversocket.shutdown(socket.SHUT_RDWR)
self.serversocket.close()
print('Socket was closed successfully')
def cleanup():
def accept_connection(self):
""" returns a socket for sending the input for idf_monitor which must be closed before calling this again. """
(clientsocket, address) = self.serversocket.accept()
# exception will be thrown here if the idf_monitor didn't connect in time
clientsocket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
return clientsocket
def test_iteration(runner, test, startup_timeout):
print('\nRunning test on {} with filter "{}" and expecting {}'.format(test[0], test[1], test[2]))
try:
os.remove(socat_in)
except:
# ignore if the file doesn't exist
pass
with open(OUT_DIR + test[2], "w", encoding='utf-8') as o_f, open(ERR_OUT, "w", encoding='utf-8') as e_f:
monitor_cmd = [sys.executable,
IDF_MONITOR, '--port', 'socket://{}:{}'.format(HOST, runner.port), '--print_filter', test[1], ELF_FILE]
(master_fd, slave_fd) = pty.openpty()
print('\t', ' '.join(monitor_cmd), sep='')
print('\tstdout="{}" stderr="{}" stdin="{}"'.format(o_f.name, e_f.name, os.ttyname(slave_fd)))
start = time.time()
# the server socket is alive so idf_monitor can start now
proc = subprocess.Popen(monitor_cmd, stdin=slave_fd, stdout=o_f, stderr=e_f, close_fds=True, bufsize=-1)
# - idf_monitor's stdin needs to be connected to some pseudo-tty in docker image even when it is not
# used at all
# - setting bufsize is needed because the default value is different on Python 2 and 3
# - the default close_fds is also different on Python 2 and 3
monitor_watchdog = threading.Timer(MONITOR_TIMEOUT, monitor_timeout, [proc])
monitor_watchdog.start()
client = runner.accept_connection()
# The connection is ready but idf_monitor cannot yet receive data. This seems to happen on Ubuntu 16.04 LTS
# and is not related to the version of Python or pyserial. There seems to be no reliable way other than to
# do a sleep. The idf_monitor header in ERR_OUT could be checked on Python 2 but the file is not flushed
# on Python 3.
print('\tSleeping for {:.2f} seconds'.format(startup_timeout))
time.sleep(startup_timeout)
with open(IN_DIR + test[0], "r", encoding='utf-8') as f:
print('\tSending {} to the socket'.format(f.name))
for line in f:
client.sendall(line.encode())
idf_exit_sequence = b'\x1d\n'
print('\tSending <exit> to the socket')
client.sendall(idf_exit_sequence)
ret = proc.wait()
end = time.time()
print('\tidf_monitor exited after {:.2f} seconds'.format(end - start))
if ret < 0:
raise RuntimeError('idf_monitor was terminated by signal {}'.format(-ret))
# idf_monitor needs to end before the socket is closed in order to exit without an exception.
finally:
if monitor_watchdog:
monitor_watchdog.cancel()
os.close(slave_fd)
os.close(master_fd)
if client:
client.close()
print('\tThe client was closed successfully')
f1 = IN_DIR + test[2]
f2 = OUT_DIR + test[2]
print('\tdiff {} {}'.format(f1, f2))
if filecmp.cmp(f1, f2, shallow=False):
print('\tTest has passed')
else:
raise RuntimeError("The contents of the files are different. Please examine the artifacts.")
def main():
start = time.time()
cleanup() # avoid sending old content
if not os.path.exists(out_dir):
os.mkdir(out_dir)
try:
with SocatRunner():
# Sleep is necessary to make sure that socat is already listening. Only one sleep is used per run (this is
# another reason while the temporary socat_in file is used instead of directly reading the test files).
time.sleep(1)
for t in test_list:
print('Running test on {} with filter "{}" and expecting {}'.format(t[0], t[1], t[2]))
with open(in_dir + t[0], "r", encoding='utf-8') as i_f, open(socat_in, "w", encoding='utf-8') as s_f:
print('cat {} > {}'.format(i_f.name, s_f.name))
for line in i_f:
s_f.write(line)
idf_exit_sequence = b'\x1d\n'
print('echo "<exit>" >> {}'.format(s_f.name))
s_f.write(idf_exit_sequence.decode())
monitor_cmd = [idf_monitor,
'--port', 'socket://localhost:2399',
'--print_filter', t[1],
elf_file]
with open(out_dir + t[2], "w", encoding='utf-8') as o_f, open(err_out, "w", encoding='utf-8') as e_f:
try:
(master_fd, slave_fd) = pty.openpty()
print(' '.join(monitor_cmd), end=' ')
print(' > {} 2> {} < {}'.format(o_f.name, e_f.name, os.ttyname(slave_fd)))
proc = subprocess.Popen(monitor_cmd, stdin=slave_fd, stdout=o_f, stderr=e_f,
close_fds=True)
proc.wait()
finally:
os.close(slave_fd)
os.close(master_fd)
diff_cmd = ['diff', in_dir + t[2], out_dir + t[2]]
print(' '.join(diff_cmd))
subprocess.check_call(diff_cmd)
print('Test has passed')
finally:
cleanup()
gstart = time.time()
if not os.path.exists(OUT_DIR):
os.mkdir(OUT_DIR)
end = time.time()
print('Execution took {:.2f} seconds'.format(end - start))
socket.setdefaulttimeout(SOCKET_TIMEOUT)
for test in test_list:
startup_timeout = 1.0
for i in range(RETRIES_PER_TEST):
with TestRunner() as runner:
# Each test (and each retry) is run with a different port (and server socket). This is done for
# the CI run where retry with a different socket is necessary to pass the test. According to the
# experiments, retry with the same port (and server socket) is not sufficient.
try:
test_iteration(runner, test, startup_timeout)
# no more retries if test_iteration exited without an exception
break
except Exception as e:
if i < RETRIES_PER_TEST - 1:
print('Test has failed with exception:', e)
print('Another attempt will be made.')
startup_timeout += 0.5
else:
raise
gend = time.time()
print('Execution took {:.2f} seconds\n'.format(gend - gstart))
if __name__ == "__main__":
main()