Merge branch 'bugfix/test_idf_monitor_hangs' into 'master'

tools: Fix idf_monitor test case occasional hangup or failure

Closes #39

See merge request idf/esp-idf!3651
This commit is contained in:
Angus Gratton 2018-11-26 15:29:55 +08:00
commit 03c59658cb
3 changed files with 128 additions and 77 deletions

View file

@ -547,7 +547,7 @@ test_idf_monitor:
expire_in: 1 week expire_in: 1 week
script: script:
- cd ${IDF_PATH}/tools/test_idf_monitor - cd ${IDF_PATH}/tools/test_idf_monitor
- ${IDF_PATH}/tools/ci/multirun_with_pyenv.sh ./run_test_idf_monitor.py - ./run_test_idf_monitor.py
test_idf_size: test_idf_size:
<<: *host_test_template <<: *host_test_template

View file

@ -373,7 +373,7 @@ class Monitor(object):
else: else:
raise RuntimeError("Bad event data %r" % ((event_tag,data),)) raise RuntimeError("Bad event data %r" % ((event_tag,data),))
except SerialStopException: except SerialStopException:
pass sys.stderr.write(ANSI_NORMAL + "Stopping condition has been received\n")
finally: finally:
try: try:
self.console_reader.stop() self.console_reader.stop()

View file

@ -19,13 +19,16 @@ from __future__ import unicode_literals
from builtins import object from builtins import object
from io import open from io import open
import os import os
import signal import sys
import time import time
import subprocess import subprocess
import socket
import pty import pty
import filecmp
import threading
test_list = ( test_list = (
# Add new tests here. All files should be placed in in_dir. Columns are: # Add new tests here. All files should be placed in IN_DIR. Columns are:
# Input file Filter string File with expected output # Input file Filter string File with expected output
('in1.txt', '', 'in1f1.txt'), ('in1.txt', '', 'in1f1.txt'),
('in1.txt', '*:V', 'in1f1.txt'), ('in1.txt', '*:V', 'in1f1.txt'),
@ -35,89 +38,137 @@ test_list = (
('in2.txt', 'vfs', 'in2f2.txt'), ('in2.txt', 'vfs', 'in2f2.txt'),
) )
in_dir = 'tests/' # tests are in this directory IN_DIR = 'tests/' # tests are in this directory
out_dir = 'outputs/' # test results are written to this directory (kept only for debugging purposes) OUT_DIR = 'outputs/' # test results are written to this directory (kept only for debugging purposes)
socat_in = './socatfile'# temporary socat file (deleted after run) ERR_OUT = OUT_DIR + 'monitor_error_output'
err_out = out_dir + 'monitor_error_output' ELF_FILE = './dummy.elf' # ELF file used for starting the monitor
elf_file = './dummy.elf' # ELF file used for starting the monitor IDF_MONITOR = '{}/tools/idf_monitor.py'.format(os.getenv("IDF_PATH"))
idf_monitor = '{}/tools/idf_monitor.py'.format(os.getenv("IDF_PATH"))
class SocatRunner(object): # connection related to communicating with idf_monitor through sockets
""" HOST = 'localhost'
Runs socat in the background for creating a socket. # blocking socket operations are used with timeout:
""" SOCKET_TIMEOUT = 30
# idf_monitor is killed if it doesn't respond:
MONITOR_TIMEOUT = 60
# the test is restarted after failure (idf_monitor has to be killed):
RETRIES_PER_TEST = 5
def monitor_timeout(process):
if process.poll() is None:
# idf_monitor is still running
try:
process.kill()
print('\tidf_monitor was killed because it did not finish in {} seconds.'.format(MONITOR_TIMEOUT))
except OSError as e:
if e.errno == errno.ESRCH:
# ignores a possible race condition which can occur when the process exits between poll() and kill()
pass
else:
raise
class TestRunner(object):
def __enter__(self): def __enter__(self):
# Wait for a connection on port 2399 and then run "tail" which will send the file content to that port. Tail self.serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# is used because it can start even when the file doesn't exists and remains running after the file has been self.serversocket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
# processed. This way the idf_monitor can end the communication when it received the content. Using regular self.serversocket.bind((HOST, 0))
# "cat" would invoke exception in idf_monitor. self.port = self.serversocket.getsockname()[1]
# Note: "-c 1GB" option is used to force sending the whole file under the assumption that all testing files self.serversocket.listen(5)
# will be much smaller than 1G.
# Note: A temporary file socat_in is used in order to be able to start socat only once instead of for each test.
socat_cmd = ['socat',
'-U', # unidirectional pipe from file to port
'TCP4-LISTEN:2399,reuseaddr,fork',
'exec:"tail -c 1GB -F ' + socat_in + '"']
print(' '.join(socat_cmd))
self._socat_process = subprocess.Popen(socat_cmd, preexec_fn=os.setsid) # See __exit__ for os.setsid
return self return self
def __exit__(self, type, value, traceback): def __exit__(self, type, value, traceback):
# self._socat_process.terminate() doesn't enough because each connection to the port starts a new socat and a self.serversocket.shutdown(socket.SHUT_RDWR)
# tail processes self.serversocket.close()
os.killpg(os.getpgid(self._socat_process.pid), signal.SIGTERM) print('Socket was closed successfully')
# Note: this terminates all the processes but makes the script UNIX-only
def cleanup(): def accept_connection(self):
""" returns a socket for sending the input for idf_monitor which must be closed before calling this again. """
(clientsocket, address) = self.serversocket.accept()
# exception will be thrown here if the idf_monitor didn't connect in time
clientsocket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
return clientsocket
def test_iteration(runner, test, startup_timeout):
print('\nRunning test on {} with filter "{}" and expecting {}'.format(test[0], test[1], test[2]))
try: try:
os.remove(socat_in) with open(OUT_DIR + test[2], "w", encoding='utf-8') as o_f, open(ERR_OUT, "w", encoding='utf-8') as e_f:
except: monitor_cmd = [sys.executable,
# ignore if the file doesn't exist IDF_MONITOR, '--port', 'socket://{}:{}'.format(HOST, runner.port), '--print_filter', test[1], ELF_FILE]
pass (master_fd, slave_fd) = pty.openpty()
print('\t', ' '.join(monitor_cmd), sep='')
print('\tstdout="{}" stderr="{}" stdin="{}"'.format(o_f.name, e_f.name, os.ttyname(slave_fd)))
start = time.time()
# the server socket is alive so idf_monitor can start now
proc = subprocess.Popen(monitor_cmd, stdin=slave_fd, stdout=o_f, stderr=e_f, close_fds=True, bufsize=-1)
# - idf_monitor's stdin needs to be connected to some pseudo-tty in docker image even when it is not
# used at all
# - setting bufsize is needed because the default value is different on Python 2 and 3
# - the default close_fds is also different on Python 2 and 3
monitor_watchdog = threading.Timer(MONITOR_TIMEOUT, monitor_timeout, [proc])
monitor_watchdog.start()
client = runner.accept_connection()
# The connection is ready but idf_monitor cannot yet receive data. This seems to happen on Ubuntu 16.04 LTS
# and is not related to the version of Python or pyserial. There seems to be no reliable way other than to
# do a sleep. The idf_monitor header in ERR_OUT could be checked on Python 2 but the file is not flushed
# on Python 3.
print('\tSleeping for {:.2f} seconds'.format(startup_timeout))
time.sleep(startup_timeout)
with open(IN_DIR + test[0], "r", encoding='utf-8') as f:
print('\tSending {} to the socket'.format(f.name))
for line in f:
client.sendall(line.encode())
idf_exit_sequence = b'\x1d\n'
print('\tSending <exit> to the socket')
client.sendall(idf_exit_sequence)
ret = proc.wait()
end = time.time()
print('\tidf_monitor exited after {:.2f} seconds'.format(end - start))
if ret < 0:
raise RuntimeError('idf_monitor was terminated by signal {}'.format(-ret))
# idf_monitor needs to end before the socket is closed in order to exit without an exception.
finally:
if monitor_watchdog:
monitor_watchdog.cancel()
os.close(slave_fd)
os.close(master_fd)
if client:
client.close()
print('\tThe client was closed successfully')
f1 = IN_DIR + test[2]
f2 = OUT_DIR + test[2]
print('\tdiff {} {}'.format(f1, f2))
if filecmp.cmp(f1, f2, shallow=False):
print('\tTest has passed')
else:
raise RuntimeError("The contents of the files are different. Please examine the artifacts.")
def main(): def main():
start = time.time() gstart = time.time()
cleanup() # avoid sending old content if not os.path.exists(OUT_DIR):
if not os.path.exists(out_dir): os.mkdir(OUT_DIR)
os.mkdir(out_dir)
try:
with SocatRunner():
# Sleep is necessary to make sure that socat is already listening. Only one sleep is used per run (this is
# another reason while the temporary socat_in file is used instead of directly reading the test files).
time.sleep(1)
for t in test_list:
print('Running test on {} with filter "{}" and expecting {}'.format(t[0], t[1], t[2]))
with open(in_dir + t[0], "r", encoding='utf-8') as i_f, open(socat_in, "w", encoding='utf-8') as s_f:
print('cat {} > {}'.format(i_f.name, s_f.name))
for line in i_f:
s_f.write(line)
idf_exit_sequence = b'\x1d\n'
print('echo "<exit>" >> {}'.format(s_f.name))
s_f.write(idf_exit_sequence.decode())
monitor_cmd = [idf_monitor,
'--port', 'socket://localhost:2399',
'--print_filter', t[1],
elf_file]
with open(out_dir + t[2], "w", encoding='utf-8') as o_f, open(err_out, "w", encoding='utf-8') as e_f:
try:
(master_fd, slave_fd) = pty.openpty()
print(' '.join(monitor_cmd), end=' ')
print(' > {} 2> {} < {}'.format(o_f.name, e_f.name, os.ttyname(slave_fd)))
proc = subprocess.Popen(monitor_cmd, stdin=slave_fd, stdout=o_f, stderr=e_f,
close_fds=True)
proc.wait()
finally:
os.close(slave_fd)
os.close(master_fd)
diff_cmd = ['diff', in_dir + t[2], out_dir + t[2]]
print(' '.join(diff_cmd))
subprocess.check_call(diff_cmd)
print('Test has passed')
finally:
cleanup()
end = time.time() socket.setdefaulttimeout(SOCKET_TIMEOUT)
print('Execution took {:.2f} seconds'.format(end - start))
for test in test_list:
startup_timeout = 1.0
for i in range(RETRIES_PER_TEST):
with TestRunner() as runner:
# Each test (and each retry) is run with a different port (and server socket). This is done for
# the CI run where retry with a different socket is necessary to pass the test. According to the
# experiments, retry with the same port (and server socket) is not sufficient.
try:
test_iteration(runner, test, startup_timeout)
# no more retries if test_iteration exited without an exception
break
except Exception as e:
if i < RETRIES_PER_TEST - 1:
print('Test has failed with exception:', e)
print('Another attempt will be made.')
startup_timeout += 0.5
else:
raise
gend = time.time()
print('Execution took {:.2f} seconds\n'.format(gend - gstart))
if __name__ == "__main__": if __name__ == "__main__":
main() main()