Merge branch 'bugfix/test_idf_monitor_hangs' into 'master'

tools: Fix idf_monitor test case occasional hangup or failure Closes #39 See merge request idf/esp-idf!3651
2018-11-26 15:29:55 +08:00 · 2018-11-26 15:29:55 +08:00 · 03c59658cb
parent c765035b3a 608fc23278
commit 03c59658cb
3 changed files with 128 additions and 77 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -547,7 +547,7 @@ test_idf_monitor:
    expire_in: 1 week
  script:
    - cd ${IDF_PATH}/tools/test_idf_monitor
-    - ${IDF_PATH}/tools/ci/multirun_with_pyenv.sh ./run_test_idf_monitor.py
+    - ./run_test_idf_monitor.py
 test_idf_size:
  <<: *host_test_template
--- a/tools/idf_monitor.py
+++ b/tools/idf_monitor.py
@ -373,7 +373,7 @@ class Monitor(object):
                else:
                    raise RuntimeError("Bad event data %r" % ((event_tag,data),))
        except SerialStopException:
-            pass
+            sys.stderr.write(ANSI_NORMAL + "Stopping condition has been received\n")
        finally:
            try:
                self.console_reader.stop()
--- a/tools/test_idf_monitor/run_test_idf_monitor.py
+++ b/tools/test_idf_monitor/run_test_idf_monitor.py
@ -19,13 +19,16 @@ from __future__ import unicode_literals
 from builtins import object
 from io import open
 import os
-import signal
+import sys
 import time
 import subprocess
 import socket
 import pty
 import filecmp
 import threading
 test_list = (
-        # Add new tests here. All files should be placed in in_dir. Columns are:
+        # Add new tests here. All files should be placed in IN_DIR. Columns are:
        # Input file            Filter string                                               File with expected output
        ('in1.txt',             '',                                                         'in1f1.txt'),
        ('in1.txt',             '*:V',                                                      'in1f1.txt'),
@ -35,89 +38,137 @@ test_list = (
        ('in2.txt',             'vfs',                                                      'in2f2.txt'),
        )
-in_dir = 'tests/'       # tests are in this directory
+IN_DIR = 'tests/'       # tests are in this directory
-out_dir = 'outputs/'    # test results are written to this directory (kept only for debugging purposes)
+OUT_DIR = 'outputs/'    # test results are written to this directory (kept only for debugging purposes)
-socat_in = './socatfile'# temporary socat file (deleted after run)
+ERR_OUT = OUT_DIR + 'monitor_error_output'
-err_out = out_dir + 'monitor_error_output'
+ELF_FILE = './dummy.elf'  # ELF file used for starting the monitor
-elf_file = './dummy.elf'  # ELF file used for starting the monitor
+IDF_MONITOR = '{}/tools/idf_monitor.py'.format(os.getenv("IDF_PATH"))
 idf_monitor = '{}/tools/idf_monitor.py'.format(os.getenv("IDF_PATH"))
-class SocatRunner(object):
+# connection related to communicating with idf_monitor through sockets
-    """
+HOST = 'localhost'
-    Runs socat in the background for creating a socket.
+# blocking socket operations are used with timeout:
-    """
+SOCKET_TIMEOUT = 30
 # idf_monitor is killed if it doesn't respond:
 MONITOR_TIMEOUT = 60
 # the test is restarted after failure (idf_monitor has to be killed):
 RETRIES_PER_TEST = 5
 def monitor_timeout(process):
    if process.poll() is None:
        # idf_monitor is still running
        try:
            process.kill()
            print('\tidf_monitor was killed because it did not finish in {} seconds.'.format(MONITOR_TIMEOUT))
        except OSError as e:
            if e.errno == errno.ESRCH:
                # ignores a possible race condition which can occur when the process exits between poll() and kill()
                pass
            else:
                raise
 class TestRunner(object):
    def __enter__(self):
-        # Wait for a connection on port 2399 and then run "tail" which will send the file content to that port. Tail
+        self.serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-        # is used because it can start even when the file doesn't exists and remains running after the file has been
+        self.serversocket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
-        # processed. This way the idf_monitor can end the communication when it received the content. Using regular
+        self.serversocket.bind((HOST, 0))
-        # "cat" would invoke exception in idf_monitor.
+        self.port = self.serversocket.getsockname()[1]
-        # Note: "-c 1GB" option is used to force sending the whole file under the assumption that all testing files
+        self.serversocket.listen(5)
        # will be much smaller than 1G.
        # Note: A temporary file socat_in is used in order to be able to start socat only once instead of for each test.
        socat_cmd = ['socat',
                '-U', # unidirectional pipe from file to port
                'TCP4-LISTEN:2399,reuseaddr,fork',
                'exec:"tail -c 1GB -F ' + socat_in + '"']
        print(' '.join(socat_cmd))
        self._socat_process = subprocess.Popen(socat_cmd, preexec_fn=os.setsid) # See __exit__ for os.setsid
        return self
    def __exit__(self, type, value, traceback):
-        # self._socat_process.terminate() doesn't enough because each connection to the port starts a new socat and a
+        self.serversocket.shutdown(socket.SHUT_RDWR)
-        # tail processes
+        self.serversocket.close()
-        os.killpg(os.getpgid(self._socat_process.pid), signal.SIGTERM)
+        print('Socket was closed successfully')
        # Note: this terminates all the processes but makes the script UNIX-only
-def cleanup():
+    def accept_connection(self):
        """ returns a socket for sending the input for idf_monitor which must be closed before calling this again. """
        (clientsocket, address) = self.serversocket.accept()
        # exception will be thrown here if the idf_monitor didn't connect in time
        clientsocket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
        return clientsocket
 def test_iteration(runner, test, startup_timeout):
    print('\nRunning test on {} with filter "{}" and expecting {}'.format(test[0], test[1], test[2]))
    try:
-        os.remove(socat_in)
+        with open(OUT_DIR + test[2], "w", encoding='utf-8') as o_f, open(ERR_OUT, "w", encoding='utf-8') as e_f:
-    except:
+            monitor_cmd = [sys.executable,
-        # ignore if the file doesn't exist
+                    IDF_MONITOR, '--port', 'socket://{}:{}'.format(HOST, runner.port), '--print_filter', test[1], ELF_FILE]
-        pass
+            (master_fd, slave_fd) = pty.openpty()
            print('\t', ' '.join(monitor_cmd), sep='')
            print('\tstdout="{}" stderr="{}" stdin="{}"'.format(o_f.name, e_f.name, os.ttyname(slave_fd)))
            start = time.time()
            # the server socket is alive so idf_monitor can start now
            proc = subprocess.Popen(monitor_cmd, stdin=slave_fd, stdout=o_f, stderr=e_f, close_fds=True, bufsize=-1)
            # - idf_monitor's stdin needs to be connected to some pseudo-tty in docker image even when it is not
            #   used at all
            # - setting bufsize is needed because the default value is different on Python 2 and 3
            # - the default close_fds is also different on Python 2 and 3
            monitor_watchdog = threading.Timer(MONITOR_TIMEOUT, monitor_timeout, [proc])
            monitor_watchdog.start()
            client = runner.accept_connection()
            # The connection is ready but idf_monitor cannot yet receive data. This seems to happen on Ubuntu 16.04 LTS
            # and is not related to the version of Python or pyserial. There seems to be no reliable way other than to
            # do a sleep. The idf_monitor header in ERR_OUT could be checked on Python 2 but the file is not flushed
            # on Python 3.
            print('\tSleeping for {:.2f} seconds'.format(startup_timeout))
            time.sleep(startup_timeout)
            with open(IN_DIR + test[0], "r", encoding='utf-8') as f:
                print('\tSending {} to the socket'.format(f.name))
                for line in f:
                    client.sendall(line.encode())
            idf_exit_sequence = b'\x1d\n'
            print('\tSending <exit> to the socket')
            client.sendall(idf_exit_sequence)
            ret = proc.wait()
            end = time.time()
            print('\tidf_monitor exited after {:.2f} seconds'.format(end - start))
            if ret < 0:
                raise RuntimeError('idf_monitor was terminated by signal {}'.format(-ret))
            # idf_monitor needs to end before the socket is closed in order to exit without an exception.
    finally:
        if monitor_watchdog:
            monitor_watchdog.cancel()
        os.close(slave_fd)
        os.close(master_fd)
        if client:
            client.close()
        print('\tThe client was closed successfully')
    f1 = IN_DIR + test[2]
    f2 = OUT_DIR + test[2]
    print('\tdiff {} {}'.format(f1, f2))
    if filecmp.cmp(f1, f2, shallow=False):
        print('\tTest has passed')
    else:
        raise RuntimeError("The contents of the files are different. Please examine the artifacts.")
 def main():
-    start = time.time()
+    gstart = time.time()
-    cleanup() # avoid sending old content
+    if not os.path.exists(OUT_DIR):
-    if not os.path.exists(out_dir):
+        os.mkdir(OUT_DIR)
        os.mkdir(out_dir)
    try:
        with SocatRunner():
            # Sleep is necessary to make sure that socat is already listening. Only one sleep is used per run (this is
            # another reason while the temporary socat_in file is used instead of directly reading the test files).
            time.sleep(1)
            for t in test_list:
                print('Running test on {} with filter "{}" and expecting {}'.format(t[0], t[1], t[2]))
                with open(in_dir + t[0], "r", encoding='utf-8') as i_f, open(socat_in, "w", encoding='utf-8') as s_f:
                    print('cat {} > {}'.format(i_f.name, s_f.name))
                    for line in i_f:
                        s_f.write(line)
                    idf_exit_sequence = b'\x1d\n'
                    print('echo "<exit>" >> {}'.format(s_f.name))
                    s_f.write(idf_exit_sequence.decode())
                monitor_cmd = [idf_monitor,
                        '--port', 'socket://localhost:2399',
                        '--print_filter', t[1],
                        elf_file]
                with open(out_dir + t[2], "w", encoding='utf-8') as o_f, open(err_out, "w", encoding='utf-8') as e_f:
                    try:
                        (master_fd, slave_fd) = pty.openpty()
                        print(' '.join(monitor_cmd), end=' ')
                        print(' > {} 2> {} < {}'.format(o_f.name, e_f.name, os.ttyname(slave_fd)))
                        proc = subprocess.Popen(monitor_cmd, stdin=slave_fd, stdout=o_f, stderr=e_f,
                                close_fds=True)
                        proc.wait()
                    finally:
                        os.close(slave_fd)
                        os.close(master_fd)
                diff_cmd = ['diff', in_dir + t[2], out_dir + t[2]]
                print(' '.join(diff_cmd))
                subprocess.check_call(diff_cmd)
                print('Test has passed')
    finally:
        cleanup()
-    end = time.time()
+    socket.setdefaulttimeout(SOCKET_TIMEOUT)
-    print('Execution took {:.2f} seconds'.format(end - start))
+
    for test in test_list:
        startup_timeout = 1.0
        for i in range(RETRIES_PER_TEST):
            with TestRunner() as runner:
                # Each test (and each retry) is run with a different port (and server socket). This is done for
                # the CI run where retry with a different socket is necessary to pass the test. According to the
                # experiments, retry with the same port (and server socket) is not sufficient.
                try:
                    test_iteration(runner, test, startup_timeout)
                    # no more retries if test_iteration exited without an exception
                    break
                except Exception as e:
                    if i < RETRIES_PER_TEST - 1:
                        print('Test has failed with exception:', e)
                        print('Another attempt will be made.')
                        startup_timeout += 0.5
                    else:
                        raise
    gend = time.time()
    print('Execution took {:.2f} seconds\n'.format(gend - gstart))
 if __name__ == "__main__":
    main()