spi_master: add new polling mode to decrease time cost each transaction

2018-01-31 11:15:23 +08:00 · 2018-01-31 11:15:23 +08:00 · 4af51833f3
parent 3004744f2c
commit 4af51833f3
7 changed files with 1314 additions and 468 deletions
--- a/components/driver/include/driver/spi_master.h
+++ b/components/driver/include/driver/spi_master.h
@ -71,9 +71,9 @@ typedef struct {
    uint8_t cs_ena_pretrans;        ///< Amount of SPI bit-cycles the cs should be activated before the transmission (0-16). This only works on half-duplex transactions.
    uint8_t cs_ena_posttrans;       ///< Amount of SPI bit-cycles the cs should stay active after the transmission (0-16)
    int clock_speed_hz;             ///< Clock speed, divisors of 80MHz, in Hz. See ``SPI_MASTER_FREQ_*``.
-    int input_delay_ns;             /**< Maximum data valid time of slave. The time required between SCLK and MISO 
-        valid, including the possible clock delay from slave to master. The driver uses this value to give an extra 
-        delay before the MISO is ready on the line. Leave at 0 unless you know you need a delay. For better timing 
+    int input_delay_ns;             /**< Maximum data valid time of slave. The time required between SCLK and MISO
+        valid, including the possible clock delay from slave to master. The driver uses this value to give an extra
+        delay before the MISO is ready on the line. Leave at 0 unless you know you need a delay. For better timing
        performance at high frequency (over 8MHz), it's suggest to have the right value.
        */
    int spics_io_num;               ///< CS GPIO pin for this device, or -1 if not used
@ -206,7 +206,10 @@ esp_err_t spi_bus_remove_device(spi_device_handle_t handle);


 /**
- * @brief Queue a SPI transaction for execution
+ * @brief Queue a SPI transaction for interrupt transaction execution. Get the result by ``spi_device_get_trans_result``.
+ *
+ * @note Normally a device cannot start (queue) polling and interrupt
+ *      transactions simultaneously.
 *
 * @param handle Device handle obtained using spi_host_add_dev
 * @param trans_desc Description of transaction to execute
@ -216,16 +219,17 @@ esp_err_t spi_bus_remove_device(spi_device_handle_t handle);
 *         - ESP_ERR_INVALID_ARG   if parameter is invalid
 *         - ESP_ERR_TIMEOUT       if there was no room in the queue before ticks_to_wait expired
 *         - ESP_ERR_NO_MEM        if allocating DMA-capable temporary buffer failed
+ *         - ESP_ERR_INVALID_STATE if previous transactions are not finished
 *         - ESP_OK                on success
 */
 esp_err_t spi_device_queue_trans(spi_device_handle_t handle, spi_transaction_t *trans_desc, TickType_t ticks_to_wait);


 /**
- * @brief Get the result of a SPI transaction queued earlier
+ * @brief Get the result of a SPI transaction queued earlier by ``spi_device_queue_trans``.
 *
- * This routine will wait until a transaction to the given device (queued earlier with
- * spi_device_queue_trans) has succesfully completed. It will then return the description of the
+ * This routine will wait until a transaction to the given device has
+ * successfully completed. It will then return the description of the
 * completed transaction so software can inspect the result and e.g. free the memory or
 * re-use the buffers.
 *
@ -247,10 +251,11 @@ esp_err_t spi_device_get_trans_result(spi_device_handle_t handle, spi_transactio
 * @brief Send a SPI transaction, wait for it to complete, and return the result
 *
 * This function is the equivalent of calling spi_device_queue_trans() followed by spi_device_get_trans_result().
- * Do not use this when there is still a transaction separately queued from spi_device_queue_trans() that hasn't been finalized
- * using spi_device_get_trans_result().
+ * Do not use this when there is still a transaction separately queued (started) from spi_device_queue_trans() or polling_start/transmit that hasn't been finalized.
 *
 * @note This function is not thread safe when multiple tasks access the same SPI device.
+ *      Normally a device cannot start (queue) polling and interrupt
+ *      transactions simultaneously.
 *
 * @param handle Device handle obtained using spi_host_add_dev
 * @param trans_desc Description of transaction to execute
@ -260,6 +265,90 @@ esp_err_t spi_device_get_trans_result(spi_device_handle_t handle, spi_transactio
 */
 esp_err_t spi_device_transmit(spi_device_handle_t handle, spi_transaction_t *trans_desc);

+
+/**
+ * @brief Immediately start a polling transaction.
+ *
+ * @note Normally a device cannot start (queue) polling and interrupt
+ *      transactions simultaneously. Moreover, a device cannot start a new polling
+ *      transaction if another polling transaction is not finished.
+ *
+ * @param handle Device handle obtained using spi_host_add_dev
+ * @param trans_desc Description of transaction to execute
+ * @param ticks_to_wait Ticks to wait until there's room in the queue;
+ *              currently only portMAX_DELAY is supported.
+ *
+ * @return
+ *         - ESP_ERR_INVALID_ARG   if parameter is invalid
+ *         - ESP_ERR_TIMEOUT       if the device cannot get control of the bus before ``ticks_to_wait`` expired
+ *         - ESP_ERR_NO_MEM        if allocating DMA-capable temporary buffer failed
+ *         - ESP_ERR_INVALID_STATE if previous transactions are not finished
+ *         - ESP_OK                on success
+ */
+esp_err_t spi_device_polling_start(spi_device_handle_t handle, spi_transaction_t *trans_desc, TickType_t ticks_to_wait);
+
+
+/**
+ * @brief Poll until the polling transaction ends.
+ *
+ * This routine will not return until the transaction to the given device has
+ * successfully completed. The task is not blocked, but actively busy-spins
+ * until the transaction is completed.
+ *
+ * @param handle Device handle obtained using spi_host_add_dev
+ * @param ticks_to_wait Ticks to wait until there's a returned item; use portMAX_DELAY to never time
+ *                      out.
+ * @return
+ *         - ESP_ERR_INVALID_ARG   if parameter is invalid
+ *         - ESP_ERR_TIMEOUT       if the transaction cannot finish before ticks_to_wait expired
+ *         - ESP_OK                on success
+ */
+esp_err_t spi_device_polling_end(spi_device_handle_t handle, TickType_t ticks_to_wait);
+
+
+/**
+ * @brief Send a polling transaction, wait for it to complete, and return the result
+ *
+ * This function is the equivalent of calling spi_device_polling_start() followed by spi_device_polling_end().
+ * Do not use this when there is still a transaction that hasn't been finalized.
+ *
+ * @note This function is not thread safe when multiple tasks access the same SPI device.
+ *      Normally a device cannot start (queue) polling and interrupt
+ *      transactions simultaneously.
+ *
+ * @param handle Device handle obtained using spi_host_add_dev
+ * @param trans_desc Description of transaction to execute
+ * @return
+ *         - ESP_ERR_INVALID_ARG   if parameter is invalid
+ *         - ESP_OK                on success
+ */
+esp_err_t spi_device_polling_transmit(spi_device_handle_t handle, spi_transaction_t *trans_desc);
+
+
+/**
+ * @brief Occupy the SPI bus for a device to do continuous transactions.
+ *
+ * Transactions to all other devices will be put off until ``spi_device_release_bus`` is called.
+ *
+ * @note The function will wait until all the existing transactions have been sent.
+ *
+ * @param device The device to occupy the bus.
+ * @param wait Time to wait before the bus is occupied by the device. Currently MUST be set to portMAX_DELAY.
+ *
+ * @return
+ *      - ESP_ERR_INVALID_ARG : ``wait`` is not set to portMAX_DELAY.
+ *      - ESP_OK : Success.
+ */
+esp_err_t spi_device_acquire_bus(spi_device_handle_t device, TickType_t wait);
+
+/**
+ * @brief Release the SPI bus occupied by the device. All other devices can start sending transactions.
+ *
+ * @param dev The device to release the bus.
+ */
+void spi_device_release_bus(spi_device_handle_t dev);
+
+
 /**
 * @brief Calculate the working frequency that is most close to desired frequency, and also the register value.
 *
@ -282,6 +371,7 @@ int spi_cal_clock(int fapb, int hz, int duty_cycle, uint32_t* reg_o);
  *         - -1 If too many cycles remaining, suggest to compensate half a clock.
  *         - 0 If no remaining cycles or dummy bits are not used.
  *         - positive value: cycles suggest to compensate.
+  *
  * @note If **dummy_o* is not zero, it means dummy bits should be applied in half duplex mode, and full duplex mode may not work.
  */
 void spi_get_timing(bool gpio_is_used, int input_delay_ns, int eff_clk, int* dummy_o, int* cycles_remain_o);
@ -290,6 +380,7 @@ void spi_get_timing(bool gpio_is_used, int input_delay_ns, int eff_clk, int* dum
  * @brief Get the frequency limit of current configurations.
  *         SPI master working at this limit is OK, while above the limit, full duplex mode and DMA will not work,
  *         and dummy bits will be aplied in the half duplex mode.
+  *
  * @param gpio_is_used True if using GPIO matrix, or False if native pins are used.
  * @param input_delay_ns Input delay from SCLK launch edge to MISO data valid.
  * @return Frequency limit of current configurations.
--- a/components/driver/spi_common.c
+++ b/components/driver/spi_common.c
@ -31,6 +31,7 @@
 #include "driver/periph_ctrl.h"
 #include "esp_heap_caps.h"
 #include "driver/spi_common.h"
+#include "stdatomic.h"

 static const char *SPI_TAG = "spi";

@ -50,7 +51,7 @@ typedef struct spi_device_t spi_device_t;
 #define DMA_CHANNEL_ENABLED(dma_chan)    (BIT(dma_chan-1))

 //Periph 1 is 'claimed' by SPI flash code.
-static bool spi_periph_claimed[3] = {true, false, false};
+static atomic_bool spi_periph_claimed[3] = { ATOMIC_VAR_INIT(true), ATOMIC_VAR_INIT(false), ATOMIC_VAR_INIT(false)};
 static uint8_t spi_dma_chan_enabled = 0;
 static portMUX_TYPE spi_dma_spinlock = portMUX_INITIALIZER_UNLOCKED;

@ -58,7 +59,8 @@ static portMUX_TYPE spi_dma_spinlock = portMUX_INITIALIZER_UNLOCKED;
 //Returns true if this peripheral is successfully claimed, false if otherwise.
 bool spicommon_periph_claim(spi_host_device_t host)
 {
-    bool ret = __sync_bool_compare_and_swap(&spi_periph_claimed[host], false, true);
+    bool false_var = false;
+    bool ret = atomic_compare_exchange_strong(&spi_periph_claimed[host], &false_var, true);
    if (ret) periph_module_enable(spi_periph_signal[host].module);
    return ret;
 }
@ -66,7 +68,8 @@ bool spicommon_periph_claim(spi_host_device_t host)
 //Returns true if this peripheral is successfully freed, false if otherwise.
 bool spicommon_periph_free(spi_host_device_t host)
 {
-    bool ret = __sync_bool_compare_and_swap(&spi_periph_claimed[host], true, false);
+    bool true_var = true;
+    bool ret = atomic_compare_exchange_strong(&spi_periph_claimed[host], &true_var, false);
    if (ret) periph_module_disable(spi_periph_signal[host].module);
    return ret;
 }
--- a/components/driver/spi_master.c
+++ b/components/driver/spi_master.c
--- a/components/driver/test/test_spi_master.c
+++ b/components/driver/test/test_spi_master.c
@ -23,6 +23,7 @@
 #include "freertos/ringbuf.h"
 #include "soc/gpio_periph.h"
 #include "sdkconfig.h"
+#include "unity_config.h"

 const static char TAG[] = "test_spi";

@ -1335,6 +1336,7 @@ TEST_CASE("spi_speed","[spi]")
    uint32_t t_flight;
    //to get rid of the influence of randomly interrupts, we measured the performance by median value
    uint32_t t_flight_sorted[TEST_TIMES];
+    esp_err_t ret;
    int t_flight_num = 0;

    spi_device_handle_t spi;
@ -1347,9 +1349,6 @@ TEST_CASE("spi_speed","[spi]")
    //first work with DMA
    speed_setup(&spi, use_dma);

-    //first time introduces a device switch, which costs more time. we skip this
-    spi_device_transmit(spi, &trans);
-
    //record flight time by isr, with DMA
    t_flight_num = 0;
    for (int i = 0; i < TEST_TIMES; i++) {
@ -1364,15 +1363,34 @@ TEST_CASE("spi_speed","[spi]")
        ESP_LOGI(TAG, "%d", t_flight_sorted[i]);
    }

+    //acquire the bus to send polling transactions faster
+    ret = spi_device_acquire_bus(spi, portMAX_DELAY);
+    TEST_ESP_OK(ret);
+
+    //record flight time by polling and with DMA
+    t_flight_num = 0;
+    for (int i = 0; i < TEST_TIMES; i++) {
+        spi_device_polling_transmit(spi, &trans); // prime the flash cache
+        RECORD_TIME_START();
+        spi_device_polling_transmit(spi, &trans);
+        RECORD_TIME_END(&t_flight);
+        sorted_array_insert(t_flight_sorted, &t_flight_num, t_flight);
+    }
+    TEST_PERFORMANCE_LESS_THAN(SPI_PER_TRANS_POLLING, "%d us", t_flight_sorted[(TEST_TIMES+1)/2]);
+    for (int i = 0; i < TEST_TIMES; i++) {
+        ESP_LOGI(TAG, "%d", t_flight_sorted[i]);
+    }
+
+    //release the bus
+    spi_device_release_bus(spi);
+
    speed_deinit(spi);
    speed_setup(&spi, !use_dma);

-    //first time introduces a device switch, which costs more time. we skip this
-    spi_device_transmit(spi, &trans);
-
    //record flight time by isr, without DMA
    t_flight_num = 0;
    for (int i = 0; i < TEST_TIMES; i++) {
+        spi_device_transmit(spi, &trans); // prime the flash cache
        RECORD_TIME_START();
        spi_device_transmit(spi, &trans);
        RECORD_TIME_END(&t_flight);
@ -1382,6 +1400,165 @@ TEST_CASE("spi_speed","[spi]")
    for (int i = 0; i < TEST_TIMES; i++) {
        ESP_LOGI(TAG, "%d", t_flight_sorted[i]);
    }
+
+    //acquire the bus to send polling transactions faster
+    ret = spi_device_acquire_bus(spi, portMAX_DELAY);
+    TEST_ESP_OK(ret);
+    //record flight time by polling, without DMA
+    t_flight_num = 0;
+    for (int i = 0; i < TEST_TIMES; i++) {
+        spi_device_polling_transmit(spi, &trans); // prime the flash cache
+        RECORD_TIME_START();
+        spi_device_polling_transmit(spi, &trans);
+        RECORD_TIME_END(&t_flight);
+        sorted_array_insert(t_flight_sorted, &t_flight_num, t_flight);
+    }
+    TEST_PERFORMANCE_LESS_THAN(SPI_PER_TRANS_POLLING_NO_DMA, "%d us", t_flight_sorted[(TEST_TIMES+1)/2]);
+    for (int i = 0; i < TEST_TIMES; i++) {
+        ESP_LOGI(TAG, "%d", t_flight_sorted[i]);
+    }
+
+    //release the bus
+    spi_device_release_bus(spi);
    speed_deinit(spi);
 }

+typedef struct {
+    spi_device_handle_t handle;
+    bool finished;
+} task_context_t;
+
+void spi_task1(void* arg)
+{
+    //task1 send 50 polling transactions, acquire the bus and send another 50
+    int count=0;
+    spi_transaction_t t = {
+        .flags = SPI_TRANS_USE_TXDATA,
+        .tx_data = { 0x80, 0x12, 0x34, 0x56 },
+        .length = 4*8,
+    };
+    spi_device_handle_t handle = ((task_context_t*)arg)->handle;
+    for( int j = 0; j < 50; j ++ ) {
+        TEST_ESP_OK(spi_device_polling_transmit( handle, &t ));
+        ESP_LOGI( TAG, "task1:%d", count++ );
+    }
+    TEST_ESP_OK(spi_device_acquire_bus( handle, portMAX_DELAY ));
+    for( int j = 0; j < 50; j ++ ) {
+        TEST_ESP_OK(spi_device_polling_transmit( handle, &t ));
+        ESP_LOGI( TAG, "task1:%d", count++ );
+    }
+    spi_device_release_bus(handle);
+    ESP_LOGI(TAG, "task1 terminates");
+    ((task_context_t*)arg)->finished = true;
+    vTaskDelete(NULL);
+}
+
+void spi_task2(void* arg)
+{
+    int count=0;
+    //task2 acquire the bus, send 50 polling transactions and then 50 non-polling
+    spi_transaction_t t = {
+        .flags = SPI_TRANS_USE_TXDATA,
+        .tx_data = { 0x80, 0x12, 0x34, 0x56 },
+        .length = 4*8,
+    };
+    spi_transaction_t *ret_t;
+    spi_device_handle_t handle = ((task_context_t*)arg)->handle;
+    TEST_ESP_OK(spi_device_acquire_bus( handle, portMAX_DELAY ));
+
+    for (int i = 0; i < 50; i ++) {
+        TEST_ESP_OK(spi_device_polling_transmit(handle, &t));
+        ESP_LOGI( TAG, "task2: %d", count++ );
+    }
+
+    for( int j = 0; j < 50; j ++ ) {
+        TEST_ESP_OK(spi_device_queue_trans( handle, &t, portMAX_DELAY ));
+    }
+    for( int j = 0; j < 50; j ++ ) {
+        TEST_ESP_OK(spi_device_get_trans_result(handle, &ret_t, portMAX_DELAY));
+        assert(ret_t == &t);
+        ESP_LOGI( TAG, "task2: %d", count++ );
+    }
+    spi_device_release_bus(handle);
+    vTaskDelay(1);
+    ESP_LOGI(TAG, "task2 terminates");
+    ((task_context_t*)arg)->finished = true;
+    vTaskDelete(NULL);
+}
+
+void spi_task3(void* arg)
+{
+    //task3 send 30 polling transactions, acquire the bus, send 20 polling transactions and then 50 non-polling
+    int count=0;
+    spi_transaction_t t = {
+        .flags = SPI_TRANS_USE_TXDATA,
+        .tx_data = { 0x80, 0x12, 0x34, 0x56 },
+        .length = 4*8,
+    };
+    spi_transaction_t *ret_t;
+    spi_device_handle_t handle = ((task_context_t*)arg)->handle;
+
+    for (int i = 0; i < 30; i ++) {
+        TEST_ESP_OK(spi_device_polling_transmit(handle, &t));
+        ESP_LOGI( TAG, "task3: %d", count++ );
+    }
+
+    TEST_ESP_OK(spi_device_acquire_bus( handle, portMAX_DELAY ));
+    for (int i = 0; i < 20; i ++) {
+        TEST_ESP_OK(spi_device_polling_transmit(handle, &t));
+        ESP_LOGI( TAG, "task3: %d", count++ );
+    }
+
+    for (int j = 0; j < 50; j++) {
+        TEST_ESP_OK(spi_device_queue_trans(handle, &t, portMAX_DELAY));
+    }
+    for (int j = 0; j < 50; j++) {
+        TEST_ESP_OK(spi_device_get_trans_result(handle, &ret_t, portMAX_DELAY));
+        assert(ret_t == &t);
+        ESP_LOGI(TAG, "task3: %d", count++);
+    }
+    spi_device_release_bus(handle);
+
+    ESP_LOGI(TAG, "task3 terminates");
+    ((task_context_t*)arg)->finished = true;
+    vTaskDelete(NULL);
+}
+
+TEST_CASE("spi poll tasks","[spi]")
+{
+    task_context_t context1={};
+    task_context_t context2={};
+    task_context_t context3={};
+    TaskHandle_t task1, task2, task3;
+    esp_err_t ret;
+    spi_bus_config_t buscfg=SPI_BUS_TEST_DEFAULT_CONFIG();
+    spi_device_interface_config_t devcfg=SPI_DEVICE_TEST_DEFAULT_CONFIG();
+    devcfg.queue_size = 100;
+
+    //Initialize the SPI bus and 3 devices
+    ret=spi_bus_initialize(HSPI_HOST, &buscfg, 1);
+    TEST_ASSERT(ret==ESP_OK);
+    ret=spi_bus_add_device(HSPI_HOST, &devcfg, &context1.handle);
+    TEST_ASSERT(ret==ESP_OK);
+    ret=spi_bus_add_device(HSPI_HOST, &devcfg, &context2.handle);
+    TEST_ASSERT(ret==ESP_OK);
+    ret=spi_bus_add_device(HSPI_HOST, &devcfg, &context3.handle);
+    TEST_ASSERT(ret==ESP_OK);
+
+    xTaskCreate( spi_task1, "task1", 2048, &context1, 0, &task1 );
+    xTaskCreate( spi_task2, "task2", 2048, &context2, 0, &task2 );
+    xTaskCreate( spi_task3, "task3", 2048, &context3, 0, &task3 );
+
+    for(;;){
+        vTaskDelay(10);
+        if (context1.finished && context2.finished && context3.finished) break;
+    }
+
+    TEST_ESP_OK( spi_bus_remove_device(context1.handle) );
+    TEST_ESP_OK( spi_bus_remove_device(context2.handle) );
+    TEST_ESP_OK( spi_bus_remove_device(context3.handle) );
+    TEST_ESP_OK( spi_bus_free(HSPI_HOST) );
+}
+
+
+//TODO: add a case when a non-polling transaction happened in the bus-acquiring time and then release the bus then queue a new trans
--- a/components/idf_test/include/idf_performance.h
+++ b/components/idf_test/include/idf_performance.h
@ -16,6 +16,8 @@
 #define IDF_PERFORMANCE_MAX_ESP_TIMER_GET_TIME_PER_CALL                         1000
 #define IDF_PERFORMANCE_MAX_SPI_PER_TRANS_NO_POLLING                            30
 #define IDF_PERFORMANCE_MAX_SPI_PER_TRANS_NO_POLLING_NO_DMA                     27
+#define IDF_PERFORMANCE_MAX_SPI_PER_TRANS_POLLING                               15
+#define IDF_PERFORMANCE_MAX_SPI_PER_TRANS_POLLING_NO_DMA                        15
 /* Due to code size & linker layout differences interacting with cache, VFS
   microbenchmark currently runs slower with PSRAM enabled. */
 #define IDF_PERFORMANCE_MAX_VFS_OPEN_WRITE_CLOSE_TIME                           20000
--- a/docs/en/api-reference/peripherals/spi_master.rst
+++ b/docs/en/api-reference/peripherals/spi_master.rst
@ -15,7 +15,16 @@ The spi_master driver

 The spi_master driver allows easy communicating with SPI slave devices, even in a multithreaded environment.
 It fully transparently handles DMA transfers to read and write data and automatically takes care of
-multiplexing between different SPI slaves on the same master
+multiplexing between different SPI slaves on the same master.
+
+.. note::
+
+    **Notes about thread safety**
+
+    The SPI driver API is thread safe when multiple SPI devices on the same bus are accessed from different tasks. However, the driver is not thread safe if the same SPI device is accessed from multiple tasks.
+
+    In this case, it is recommended to either refactor your application so only a single task accesses each SPI device, or to add mutex locking around access of the shared device.
+

 Terminology
 ^^^^^^^^^^^
@ -45,7 +54,6 @@ The spi_master driver uses the following terms:
  CS going inactive again. Transactions are atomic, as in they will never be interrupted by another
  transaction.

-
 SPI transactions
 ^^^^^^^^^^^^^^^^

@ -73,6 +81,154 @@ Something similar is true for the read and write phase: not every transaction ne
 as well as data to be read. When ``rx_buffer`` is NULL (and SPI_USE_RXDATA) is not set) the read phase
 is skipped. When ``tx_buffer`` is NULL (and SPI_USE_TXDATA) is not set) the write phase is skipped.

+The driver offers two different kinds of transactions: the interrupt
+transactions and the polling transactions. Each device can choose one kind of
+transaction to send. See :ref:`mixed_transactions` if your device does require
+both kinds of transactions.
+
+.. _interrupt_transactions:
+
+Interrupt transactions
+""""""""""""""""""""""""
+
+The interrupt transactions use an interrupt-driven logic when the
+transactions are in-flight. The routine will get blocked, allowing the CPU to
+run other tasks, while it is waiting for a transaction to be finished.
+
+Interrupt transactions can be queued into a device, and the driver automatically
+sends them one-by-one in the ISR. A task can queue several transactions, and
+then do something else before the transactions are finished.
+
+.. _polling_transactions:
+
+Polling transactions
+""""""""""""""""""""
+
+The polling transactions don't rely on the interrupt, the routine keeps polling
+the status bit of the SPI peripheral until the transaction is done.
+
+All the tasks that do interrupt transactions may get blocked by the queue, at
+which point they need to wait for the ISR to run twice before the transaction
+is done. Polling transactions save the time spent on queue handling and
+context switching, resulting in a smaller transaction interval. The
+disadvantage is that the CPU is busy while these transactions are in
+flight.
+
+The ``spi_device_polling_end`` routine spends at least 1us overhead to
+unblock other tasks when the transaction is done. It is strongly recommended
+to wrap a series of polling transactions inside of ``spi_device_acquire_bus``
+and ``spi_device_release_bus`` to avoid the overhead. (See
+:ref:`bus_acquiring`)
+
+Command and address phases
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+During the command and address phases, ``cmd`` and ``addr`` field in the
+``spi_transaction_t`` struct are sent to the bus, while nothing is read at the
+same time. The default length of command and address phase are set in the
+``spi_device_interface_config_t`` and by ``spi_bus_add_device``. When the
+flags ``SPI_TRANS_VARIABLE_CMD`` and ``SPI_TRANS_VARIABLE_ADDR`` are not set in
+the ``spi_transaction_t``, the driver automatically sets the lengths of these
+phases to the respective default values set when the device was initialized.
+
+If the length of command and address phases needs to be variable, declare a
+``spi_transaction_ext_t`` descriptor, set the flag ``SPI_TRANS_VARIABLE_CMD``
+or/and ``SPI_TRANS_VARIABLE_ADDR`` in the ``flags`` of ``base`` member and
+configure the rest of ``base`` as usual. The lengths of the two phases will then
+be the ``command_bits`` and ``address_bits`` set in the ``spi_transaction_ext_t``.
+
+Write and read phases
+^^^^^^^^^^^^^^^^^^^^^
+
+Normally, data to be transferred to or from a device will be read from or written to a chunk of memory
+indicated by the ``rx_buffer`` and ``tx_buffer`` members of the transaction structure.
+When DMA is enabled for transfers, these buffers are highly recommended to meet the requirements as below:
+
+  1. allocated in DMA-capable memory using ``pvPortMallocCaps(size, MALLOC_CAP_DMA)``;
+  2. 32-bit aligned (start from the boundary and have length of multiples of 4 bytes).
+
+If these requirements are not satisfied, efficiency of the transaction will suffer due to the allocation and
+memcpy of temporary buffers.
+
+.. note::  Half duplex transactions with both read and write phases are not supported when using DMA. See
+  :ref:`spi_known_issues` for details and workarounds.
+
+.. _bus_acquiring:
+
+Bus acquiring
+^^^^^^^^^^^^^
+
+Sometimes you may want to send SPI transactions exclusively and continuously, to
+make them as fast as possible. You may use ``spi_device_acquire_bus`` and
+``spi_device_release_bus`` to achieve this. When the bus is acquired,
+transactions to other devices (whether polling or interrupt) are left pending
+until the bus is released.
+
+Using the spi_master driver
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Initialize a SPI bus by calling ``spi_bus_initialize``. Make sure to set the correct IO pins in
+  the ``bus_config`` struct. Take care to set signals that are not needed to -1.
+
+- Tell the driver about a SPI slave device connected to the bus by calling spi_bus_add_device.
+  Make sure to configure any timing requirements the device has in the ``dev_config`` structure.
+  You should now have a handle for the device, to be used when sending it a transaction.
+
+- To interact with the device, fill one or more spi_transaction_t structure with any transaction
+  parameters you need. Then send them either in a polling way or the interrupt way:
+
+    - :ref:`Interrupt <interrupt_transactions>`
+        Either queue all transactions by calling ``spi_device_queue_trans``,
+        and at a later time query the result using
+        ``spi_device_get_trans_result``, or handle all requests
+        synchronously by feeding them into ``spi_device_transmit``.
+
+    - :ref:`Polling <polling_transactions>`
+        Call the ``spi_device_polling_transmit`` to send polling
+        transactions. Alternatively, you can send a polling transaction by
+        ``spi_device_polling_start`` and ``spi_device_polling_end`` if you
+        want to insert something between them.
+
+- Optional: to do back-to-back transactions to a device, call
+  ``spi_device_acquire_bus`` before and ``spi_device_release_bus`` after the
+  transactions.
+
+- Optional: to unload the driver for a device, call ``spi_bus_remove_device`` with the device
+  handle as an argument
+
+- Optional: to remove the driver for a bus, make sure no more drivers are attached and call
+  ``spi_bus_free``.
+
+Tips
+""""
+
+1. Transactions with small amount of data:
+    Sometimes, the amount of data is very small making it less than optimal allocating a separate buffer
+    for it. If the data to be transferred is 32 bits or less, it can be stored in the transaction struct
+    itself. For transmitted data, use the ``tx_data`` member for this and set the ``SPI_USE_TXDATA`` flag
+    on the transmission. For received data, use ``rx_data`` and set ``SPI_USE_RXDATA``. In both cases, do
+    not touch the ``tx_buffer`` or ``rx_buffer`` members, because they use the same memory locations
+    as ``tx_data`` and ``rx_data``.
+
+2. Transactions with integers other than uint8_t
+    The SPI peripheral reads and writes the memory byte-by-byte. By default,
+    the SPI works in MSB-first mode: each byte is sent or received from the
+    MSB to the LSB. However, if you want to send data whose length is not a
+    multiple of 8 bits, unused bits are sent.
+
+    E.g. you write ``uint8_t data = 0x15`` (00010101B), and set length to
+    only 5 bits, the sent data is ``00010B`` rather than expected ``10101B``.
+
+    Moreover, ESP32 is a little-endian chip: the lowest byte of uint16_t and
+    uint32_t variables is stored at the lowest address. Hence if
+    a uint16_t is stored in the memory, its bit 7 is sent first, then bits 6
+    to 0, followed by its bits 15 to 8.
+
+    To send data other than uint8_t arrays, the macro ``SPI_SWAP_DATA_TX`` is
+    provided to shift your data to the MSB and swap the MSB to the lowest
+    address; while ``SPI_SWAP_DATA_RX`` can be used to swap received data
+    from the MSB to its correct place.
+
 GPIO matrix and IOMUX
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^

@ -113,93 +269,39 @@ IOMUX pins for SPI controllers are as below:

 note * Only the first device attaching to the bus can use CS0 pin.

-Using the spi_master driver
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. _mixed_transactions:

- Initialize a SPI bus by calling ``spi_bus_initialize``. Make sure to set the correct IO pins in
-  the ``bus_config`` struct. Take care to set signals that are not needed to -1.
+Notes to send mixed transactions to the same device
+"""""""""""""""""""""""""""""""""""""""""""""""""""

- Tell the driver about a SPI slave device connected to the bus by calling spi_bus_add_device.
-  Make sure to configure any timing requirements the device has in the ``dev_config`` structure.
-  You should now have a handle for the device, to be used when sending it a transaction.
+Though we suggest sending only one type (interrupt or polling) of
+transactions to one device to reduce coding complexity, sending both
+interrupt and polling transactions alternately is supported. The notes below
+are to help you do this.

- To interact with the device, fill one or more spi_transaction_t structure with any transaction
-  parameters you need. Either queue all transactions by calling ``spi_device_queue_trans``, later
-  quering the result using ``spi_device_get_trans_result``, or handle all requests synchroneously
-  by feeding them into ``spi_device_transmit``.
+A polling transaction should be started only when all other transactions
+are finished, no matter whether they are polling or interrupt transactions.

- Optional: to unload the driver for a device, call ``spi_bus_remove_device`` with the device
-  handle as an argument
+An unfinished polling transaction forbids other transactions from being sent.
+Always call ``spi_device_polling_end`` after ``spi_device_polling_start`` to
+allow other devices to use the bus, or to allow other transactions to be
+started on the same device. You can use ``spi_device_polling_transmit`` to
+simplify this if you don't need to do anything during your polling transaction.

- Optional: to remove the driver for a bus, make sure no more drivers are attached and call
-  ``spi_bus_free``.
+An in-flight polling transaction would get disturbed by the ISR operation
+caused by interrupt transactions. Always make sure all the interrupt
+transactions sent to the ISR are finished before you call
+``spi_device_polling_start``. To do that, you can call
+``spi_device_get_trans_result`` until all the transactions are returned.

-Command and address phases
-^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-During the command and address phases, ``cmd`` and ``addr`` field in the
-``spi_transaction_t`` struct are sent to the bus, while nothing is read at the
-same time. The default length of command and address phase are set in the
-``spi_device_interface_config_t`` and by ``spi_bus_add_device``. When the the
-flag ``SPI_TRANS_VARIABLE_CMD`` and ``SPI_TRANS_VARIABLE_ADDR`` are not set in
-the ``spi_transaction_t``,the driver automatically set the length of these
-phases to the default value as set when the device is initialized respectively.
-
-If the length of command and address phases needs to be variable, declare a
-``spi_transaction_ext_t`` descriptor, set the flag ``SPI_TRANS_VARIABLE_CMD``
-or/and ``SPI_TRANS_VARIABLE_ADDR`` in the ``flags`` of ``base`` member and
-configure the rest part of ``base`` as usual. Then the length of each phases
-will be ``command_bits`` and ``address_bits`` set in the ``spi_transaction_ext_t``.
-
-Write and read phases
-^^^^^^^^^^^^^^^^^^^^^
-
-Normally, data to be transferred to or from a device will be read from or written to a chunk of memory
-indicated by the ``rx_buffer`` and ``tx_buffer`` members of the transaction structure.
-When DMA is enabled for transfers, these buffers are highly recommended to meet the requirements as below:
-
-  1. allocated in DMA-capable memory using ``pvPortMallocCaps(size, MALLOC_CAP_DMA)``;
-  2. 32-bit aligned (start from the boundary and have length of multiples of 4 bytes).
-
-If these requirements are not satisfied, efficiency of the transaction will suffer due to the allocation and
-memcpy of temporary buffers.
-
-.. note::  Half duplex transactions with both read and write phases are not supported when using DMA. See
-  :ref:`spi_known_issues` for details and workarounds.
-
-Tips
-""""
-
-1. Transactions with small amount of data:
-    Sometimes, the amount of data is very small making it less than optimal allocating a separate buffer
-    for it. If the data to be transferred is 32 bits or less, it can be stored in the transaction struct
-    itself. For transmitted data, use the ``tx_data`` member for this and set the ``SPI_USE_TXDATA`` flag
-    on the transmission. For received data, use ``rx_data`` and set ``SPI_USE_RXDATA``. In both cases, do
-    not touch the ``tx_buffer`` or ``rx_buffer`` members, because they use the same memory locations
-    as ``tx_data`` and ``rx_data``.
-
-2. Transactions with integers other than uint8_t
-    The SPI peripheral reads and writes the memory byte-by-byte. By default,
-    the SPI works at MSB first mode, each bytes are sent or received from the
-    MSB to the LSB. However, if you want to send data with length which is
-    not multiples of 8 bits, unused bits are sent.
-
-    E.g. you write ``uint8_t data = 0x15`` (00010101B), and set length to
-    only 5 bits, the sent data is ``00010B`` rather than expected ``10101B``.
-
-    Moreover, ESP32 is a little-endian chip whose lowest byte is stored at
-    the very beginning address for uint16_t and uint32_t variables. Hence if
-    a uint16_t is stored in the memory, it's bit 7 is first sent, then bit 6
-    to 0, then comes its bit 15 to bit 8.
-
-    To send data other than uint8_t arrays, macros ``SPI_SWAP_DATA_TX`` is
-    provided to shift your data to the MSB and swap the MSB to the lowest
-    address; while ``SPI_SWAP_DATA_RX`` can be used to swap received data
-    from the MSB to it's correct place.
+It is strongly recommended to send mixed transactions to the same device in
+only one task to control the calling sequence of functions.

 Speed and Timing Considerations
 -------------------------------

+.. _speed_considerations:
+
 Transferring speed
 ^^^^^^^^^^^^^^^^^^

@ -207,13 +309,20 @@ There're two factors limiting the transferring speed: (1) The transaction interv
 When large transactions are used, the clock frequency determines the transferring speed; while the interval effects the
 speed a lot if small transactions are used.

-    1. Transaction interval: The interval mainly comes from the cost of FreeRTOS queues and the time switching between
-       tasks and the ISR. It also takes time for the software to setup spi peripheral registers as well as copy data to
-       FIFOs, or setup DMA links. Depending on whether the DMA is used, the interval of an one-byte transaction is around
-       25us typically.
+    1. Transaction interval: It takes time for the software to set up SPI
+       peripheral registers as well as copy data to FIFOs, or set up DMA links.
+       When interrupt transactions are used, extra overhead is added by the
+       cost of FreeRTOS queues and the time spent switching between tasks and
+       the ISR.
+
+            1. For **interrupt transactions**, the CPU can switch to other
+               tasks while the transaction is in flight. This saves CPU time
+               but increases the interval (see :ref:`interrupt_transactions`).
+               For **polling transactions**, the task is not blocked but
+               polls while the transaction is in flight (see
+               :ref:`polling_transactions`).

-            1.  The CPU is blocked and switched to other tasks when the
-                transaction is in flight. This save the cpu time but increase the interval.
            2.  When the DMA is enabled, it needs about 2us per transaction to setup the linked list. When the master is
                transferring, it automatically read data from the linked list. If the DMA is not enabled,
                CPU has to write/read each byte to/from the FIFO by itself. Usually this is faster than 2us, but the
@ -221,20 +330,20 @@ speed a lot if small transactions are used.

       Typical transaction interval with one byte data is as below:

-       +--------+------------------+
-       |        | Transaction Time |
-       +========+==================+
-       |        | Typical (us)     |
-       +--------+------------------+
-       | DMA    | 24               |
-       +--------+------------------+
-       | No DMA | 22               |
-       +--------+------------------+
+       +--------+----------------+--------------+
+       |        | Typical Transaction Time (us) |
+       +========+================+==============+
+       |        | Interrupt      | Polling      |
+       +--------+----------------+--------------+
+       | DMA    | 24             | 8            |
+       +--------+----------------+--------------+
+       | No DMA | 22             | 7            |
+       +--------+----------------+--------------+

    2. SPI clock frequency: Each byte transferred takes 8 times of the clock period *8/fspi*. If the clock frequency is
       too high, some functions may be limited to use. See :ref:`timing_considerations`.

-For a normal transaction, the overall cost is *20+8n/Fspi[MHz]* [us] for n bytes tranferred
+For an interrupt transaction, the overall cost is *20+8n/Fspi[MHz]* [us] for n bytes transferred
 in one transaction. Hence the transferring speed is : *n/(20+8n/Fspi)*. Example of transferring speed under 8MHz
 clock speed:

@ -381,13 +490,6 @@ table:
 +--------+------------------+----------------------+-------------------+


-Thread Safety
-------------
-
-The SPI driver API is thread safe when multiple SPI devices on the same bus are accessed from different tasks. However, the driver is not thread safe if the same SPI device is accessed from multiple tasks.
-
-In this case, it is recommended to either refactor your application so only a single task accesses each SPI device, or to add mutex locking around access of the shared device.
-
 .. _spi_known_issues:

 Known Issues
--- a/examples/peripherals/spi_master/main/spi_master_example_main.c
+++ b/examples/peripherals/spi_master/main/spi_master_example_main.c
@ -152,7 +152,13 @@ DRAM_ATTR static const lcd_init_cmd_t ili_init_cmds[]={
    {0, {0}, 0xff},
 };

-//Send a command to the LCD. Uses spi_device_transmit, which waits until the transfer is complete.
+/* Send a command to the LCD. Uses spi_device_polling_transmit, which waits
+ * until the transfer is complete.
+ *
+ * Since command transactions are usually small, they are handled in polling
+ * mode for higher speed. The overhead of interrupt transactions is more than
+ * just waiting for the transaction to complete.
+ */
 void lcd_cmd(spi_device_handle_t spi, const uint8_t cmd)
 {
    esp_err_t ret;
@ -161,11 +167,17 @@ void lcd_cmd(spi_device_handle_t spi, const uint8_t cmd)
    t.length=8;                     //Command is 8 bits
    t.tx_buffer=&cmd;               //The data is the cmd itself
    t.user=(void*)0;                //D/C needs to be set to 0
-    ret=spi_device_transmit(spi, &t);  //Transmit!
+    ret=spi_device_polling_transmit(spi, &t);  //Transmit!
    assert(ret==ESP_OK);            //Should have had no issues.
 }

-//Send data to the LCD. Uses spi_device_transmit, which waits until the transfer is complete.
+/* Send data to the LCD. Uses spi_device_polling_transmit, which waits until the
+ * transfer is complete.
+ *
+ * Since data transactions are usually small, they are handled in polling
+ * mode for higher speed. The overhead of interrupt transactions is more than
+ * just waiting for the transaction to complete.
+ */
 void lcd_data(spi_device_handle_t spi, const uint8_t *data, int len)
 {
    esp_err_t ret;
@ -175,7 +187,7 @@ void lcd_data(spi_device_handle_t spi, const uint8_t *data, int len)
    t.length=len*8;                 //Len is in bytes, transaction length is in bits.
    t.tx_buffer=data;               //Data
    t.user=(void*)1;                //D/C needs to be set to 1
-    ret=spi_device_transmit(spi, &t);  //Transmit!
+    ret=spi_device_polling_transmit(spi, &t);  //Transmit!
    assert(ret==ESP_OK);            //Should have had no issues.
 }

@ -190,7 +202,7 @@ void lcd_spi_pre_transfer_callback(spi_transaction_t *t)
 uint32_t lcd_get_id(spi_device_handle_t spi)
 {
    //get_id cmd
-    lcd_cmd( spi, 0x04);
+    lcd_cmd(spi, 0x04);

    spi_transaction_t t;
    memset(&t, 0, sizeof(t));
@ -198,7 +210,7 @@ uint32_t lcd_get_id(spi_device_handle_t spi)
    t.flags = SPI_TRANS_USE_RXDATA;
    t.user = (void*)1;

-    esp_err_t ret = spi_device_transmit(spi, &t);
+    esp_err_t ret = spi_device_polling_transmit(spi, &t);
    assert( ret == ESP_OK );

    return *(uint32_t*)t.rx_data;
@ -269,10 +281,13 @@ void lcd_init(spi_device_handle_t spi)
 }


-//To send a set of lines we have to send a command, 2 data bytes, another command, 2 more data bytes and another command
-//before sending the line data itself; a total of 6 transactions. (We can't put all of this in just one transaction
-//because the D/C line needs to be toggled in the middle.)
-//This routine queues these commands up so they get sent as quickly as possible.
+/* To send a set of lines we have to send a command, 2 data bytes, another command, 2 more data bytes and another command
+ * before sending the line data itself; a total of 6 transactions. (We can't put all of this in just one transaction
+ * because the D/C line needs to be toggled in the middle.)
+ * This routine queues these commands up as interrupt transactions so they get
+ * sent faster (compared to calling spi_device_transmit several times), and in
+ * the meantime the lines for the next transactions can be calculated.
+ */
 static void send_lines(spi_device_handle_t spi, int ypos, uint16_t *linedata)
 {
    esp_err_t ret;