tools: Fix idf_monitor test case occasional hangup or failure

Closes idf/esp-idf#39
2018-11-06 11:36:46 +01:00 · 2018-11-06 11:36:46 +01:00 · 608fc23278
parent 64d038c0d5
commit 608fc23278
3 changed files with 128 additions and 77 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -547,7 +547,7 @@ test_idf_monitor:
    expire_in: 1 week
  script:
    - cd ${IDF_PATH}/tools/test_idf_monitor
-    - ${IDF_PATH}/tools/ci/multirun_with_pyenv.sh ./run_test_idf_monitor.py
+    - ./run_test_idf_monitor.py

 test_idf_size:
  <<: *host_test_template
--- a/tools/idf_monitor.py
+++ b/tools/idf_monitor.py
@ -373,7 +373,7 @@ class Monitor(object):
                else:
                    raise RuntimeError("Bad event data %r" % ((event_tag,data),))
        except SerialStopException:
-            pass
+            sys.stderr.write(ANSI_NORMAL + "Stopping condition has been received\n")
        finally:
            try:
                self.console_reader.stop()
--- a/tools/test_idf_monitor/run_test_idf_monitor.py
+++ b/tools/test_idf_monitor/run_test_idf_monitor.py
@ -19,13 +19,16 @@ from __future__ import unicode_literals
 from builtins import object
 from io import open
 import os
-import signal
+import sys
 import time
 import subprocess
+import socket
 import pty
+import filecmp
+import threading

 test_list = (
-        # Add new tests here. All files should be placed in in_dir. Columns are:
+        # Add new tests here. All files should be placed in IN_DIR. Columns are:
        # Input file            Filter string                                               File with expected output
        ('in1.txt',             '',                                                         'in1f1.txt'),
        ('in1.txt',             '*:V',                                                      'in1f1.txt'),
@ -35,89 +38,137 @@ test_list = (
        ('in2.txt',             'vfs',                                                      'in2f2.txt'),
        )

-in_dir = 'tests/'       # tests are in this directory
-out_dir = 'outputs/'    # test results are written to this directory (kept only for debugging purposes)
-socat_in = './socatfile'# temporary socat file (deleted after run)
-err_out = out_dir + 'monitor_error_output'
-elf_file = './dummy.elf'  # ELF file used for starting the monitor
-idf_monitor = '{}/tools/idf_monitor.py'.format(os.getenv("IDF_PATH"))
+IN_DIR = 'tests/'       # tests are in this directory
+OUT_DIR = 'outputs/'    # test results are written to this directory (kept only for debugging purposes)
+ERR_OUT = OUT_DIR + 'monitor_error_output'
+ELF_FILE = './dummy.elf'  # ELF file used for starting the monitor
+IDF_MONITOR = '{}/tools/idf_monitor.py'.format(os.getenv("IDF_PATH"))

-class SocatRunner(object):
-    """
-    Runs socat in the background for creating a socket.
-    """
+# connection related to communicating with idf_monitor through sockets
+HOST = 'localhost'
+# blocking socket operations are used with timeout:
+SOCKET_TIMEOUT = 30
+# idf_monitor is killed if it doesn't respond:
+MONITOR_TIMEOUT = 60
+# the test is restarted after failure (idf_monitor has to be killed):
+RETRIES_PER_TEST = 5
+
+def monitor_timeout(process):
+    if process.poll() is None:
+        # idf_monitor is still running
+        try:
+            process.kill()
+            print('\tidf_monitor was killed because it did not finish in {} seconds.'.format(MONITOR_TIMEOUT))
+        except OSError as e:
+            if e.errno == errno.ESRCH:
+                # ignores a possible race condition which can occur when the process exits between poll() and kill()
+                pass
+            else:
+                raise
+
+class TestRunner(object):
    def __enter__(self):
-        # Wait for a connection on port 2399 and then run "tail" which will send the file content to that port. Tail
-        # is used because it can start even when the file doesn't exists and remains running after the file has been
-        # processed. This way the idf_monitor can end the communication when it received the content. Using regular
-        # "cat" would invoke exception in idf_monitor.
-        # Note: "-c 1GB" option is used to force sending the whole file under the assumption that all testing files
-        # will be much smaller than 1G.
-        # Note: A temporary file socat_in is used in order to be able to start socat only once instead of for each test.
-        socat_cmd = ['socat',
-                '-U', # unidirectional pipe from file to port
-                'TCP4-LISTEN:2399,reuseaddr,fork',
-                'exec:"tail -c 1GB -F ' + socat_in + '"']
-        print(' '.join(socat_cmd))
-        self._socat_process = subprocess.Popen(socat_cmd, preexec_fn=os.setsid) # See __exit__ for os.setsid
+        self.serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        self.serversocket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
+        self.serversocket.bind((HOST, 0))
+        self.port = self.serversocket.getsockname()[1]
+        self.serversocket.listen(5)
        return self

    def __exit__(self, type, value, traceback):
-        # self._socat_process.terminate() doesn't enough because each connection to the port starts a new socat and a
-        # tail processes
-        os.killpg(os.getpgid(self._socat_process.pid), signal.SIGTERM)
-        # Note: this terminates all the processes but makes the script UNIX-only
+        self.serversocket.shutdown(socket.SHUT_RDWR)
+        self.serversocket.close()
+        print('Socket was closed successfully')

-def cleanup():
+    def accept_connection(self):
+        """ returns a socket for sending the input for idf_monitor which must be closed before calling this again. """
+        (clientsocket, address) = self.serversocket.accept()
+        # exception will be thrown here if the idf_monitor didn't connect in time
+        clientsocket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
+        return clientsocket
+
+def test_iteration(runner, test, startup_timeout):
+    print('\nRunning test on {} with filter "{}" and expecting {}'.format(test[0], test[1], test[2]))
    try:
-        os.remove(socat_in)
-    except:
-        # ignore if the file doesn't exist
-        pass
+        with open(OUT_DIR + test[2], "w", encoding='utf-8') as o_f, open(ERR_OUT, "w", encoding='utf-8') as e_f:
+            monitor_cmd = [sys.executable,
+                    IDF_MONITOR, '--port', 'socket://{}:{}'.format(HOST, runner.port), '--print_filter', test[1], ELF_FILE]
+            (master_fd, slave_fd) = pty.openpty()
+            print('\t', ' '.join(monitor_cmd), sep='')
+            print('\tstdout="{}" stderr="{}" stdin="{}"'.format(o_f.name, e_f.name, os.ttyname(slave_fd)))
+            start = time.time()
+            # the server socket is alive so idf_monitor can start now
+            proc = subprocess.Popen(monitor_cmd, stdin=slave_fd, stdout=o_f, stderr=e_f, close_fds=True, bufsize=-1)
+            # - idf_monitor's stdin needs to be connected to some pseudo-tty in docker image even when it is not
+            #   used at all
+            # - setting bufsize is needed because the default value is different on Python 2 and 3
+            # - the default close_fds is also different on Python 2 and 3
+            monitor_watchdog = threading.Timer(MONITOR_TIMEOUT, monitor_timeout, [proc])
+            monitor_watchdog.start()
+            client = runner.accept_connection()
+            # The connection is ready but idf_monitor cannot yet receive data. This seems to happen on Ubuntu 16.04 LTS
+            # and is not related to the version of Python or pyserial. There seems to be no reliable way other than to
+            # do a sleep. The idf_monitor header in ERR_OUT could be checked on Python 2 but the file is not flushed
+            # on Python 3.
+            print('\tSleeping for {:.2f} seconds'.format(startup_timeout))
+            time.sleep(startup_timeout)
+            with open(IN_DIR + test[0], "r", encoding='utf-8') as f:
+                print('\tSending {} to the socket'.format(f.name))
+                for line in f:
+                    client.sendall(line.encode())
+            idf_exit_sequence = b'\x1d\n'
+            print('\tSending <exit> to the socket')
+            client.sendall(idf_exit_sequence)
+            ret = proc.wait()
+            end = time.time()
+            print('\tidf_monitor exited after {:.2f} seconds'.format(end - start))
+            if ret < 0:
+                raise RuntimeError('idf_monitor was terminated by signal {}'.format(-ret))
+            # idf_monitor needs to end before the socket is closed in order to exit without an exception.
+    finally:
+        if monitor_watchdog:
+            monitor_watchdog.cancel()
+        os.close(slave_fd)
+        os.close(master_fd)
+        if client:
+            client.close()
+        print('\tThe client was closed successfully')
+    f1 = IN_DIR + test[2]
+    f2 = OUT_DIR + test[2]
+    print('\tdiff {} {}'.format(f1, f2))
+    if filecmp.cmp(f1, f2, shallow=False):
+        print('\tTest has passed')
+    else:
+        raise RuntimeError("The contents of the files are different. Please examine the artifacts.")

 def main():
-    start = time.time()
-    cleanup() # avoid sending old content
-    if not os.path.exists(out_dir):
-        os.mkdir(out_dir)
-    try:
-        with SocatRunner():
-            # Sleep is necessary to make sure that socat is already listening. Only one sleep is used per run (this is
-            # another reason while the temporary socat_in file is used instead of directly reading the test files).
-            time.sleep(1)
-            for t in test_list:
-                print('Running test on {} with filter "{}" and expecting {}'.format(t[0], t[1], t[2]))
-                with open(in_dir + t[0], "r", encoding='utf-8') as i_f, open(socat_in, "w", encoding='utf-8') as s_f:
-                    print('cat {} > {}'.format(i_f.name, s_f.name))
-                    for line in i_f:
-                        s_f.write(line)
-                    idf_exit_sequence = b'\x1d\n'
-                    print('echo "<exit>" >> {}'.format(s_f.name))
-                    s_f.write(idf_exit_sequence.decode())
-                monitor_cmd = [idf_monitor,
-                        '--port', 'socket://localhost:2399',
-                        '--print_filter', t[1],
-                        elf_file]
-                with open(out_dir + t[2], "w", encoding='utf-8') as o_f, open(err_out, "w", encoding='utf-8') as e_f:
-                    try:
-                        (master_fd, slave_fd) = pty.openpty()
-                        print(' '.join(monitor_cmd), end=' ')
-                        print(' > {} 2> {} < {}'.format(o_f.name, e_f.name, os.ttyname(slave_fd)))
-                        proc = subprocess.Popen(monitor_cmd, stdin=slave_fd, stdout=o_f, stderr=e_f,
-                                close_fds=True)
-                        proc.wait()
-                    finally:
-                        os.close(slave_fd)
-                        os.close(master_fd)
-                diff_cmd = ['diff', in_dir + t[2], out_dir + t[2]]
-                print(' '.join(diff_cmd))
-                subprocess.check_call(diff_cmd)
-                print('Test has passed')
-    finally:
-        cleanup()
+    gstart = time.time()
+    if not os.path.exists(OUT_DIR):
+        os.mkdir(OUT_DIR)

-    end = time.time()
-    print('Execution took {:.2f} seconds'.format(end - start))
+    socket.setdefaulttimeout(SOCKET_TIMEOUT)
+
+    for test in test_list:
+        startup_timeout = 1.0
+        for i in range(RETRIES_PER_TEST):
+            with TestRunner() as runner:
+                # Each test (and each retry) is run with a different port (and server socket). This is done for
+                # the CI run where retry with a different socket is necessary to pass the test. According to the
+                # experiments, retry with the same port (and server socket) is not sufficient.
+                try:
+                    test_iteration(runner, test, startup_timeout)
+                    # no more retries if test_iteration exited without an exception
+                    break
+                except Exception as e:
+                    if i < RETRIES_PER_TEST - 1:
+                        print('Test has failed with exception:', e)
+                        print('Another attempt will be made.')
+                        startup_timeout += 0.5
+                    else:
+                        raise
+
+    gend = time.time()
+    print('Execution took {:.2f} seconds\n'.format(gend - gstart))

 if __name__ == "__main__":
    main()