From 4af51833f3d2d9e5030851eb5ae7dc8dc8f57023 Mon Sep 17 00:00:00 2001
From: "Michael (Xiao Xufeng)" <xiaoxufeng@espressif.com>
Date: Wed, 31 Jan 2018 11:15:23 +0800
Subject: [PATCH] spi_master: add new polling mode to decrease time cost each
 transaction

---
 components/driver/include/driver/spi_master.h |  109 +-
 components/driver/spi_common.c                |    9 +-
 components/driver/spi_master.c                | 1132 ++++++++++++-----
 components/driver/test/test_spi_master.c      |  189 ++-
 components/idf_test/include/idf_performance.h |    2 +
 .../api-reference/peripherals/spi_master.rst  |  306 +++--
 .../spi_master/main/spi_master_example_main.c |   35 +-
 7 files changed, 1314 insertions(+), 468 deletions(-)

diff --git a/components/driver/include/driver/spi_master.h b/components/driver/include/driver/spi_master.h
index 4e64c404e..ca8246889 100644
--- a/components/driver/include/driver/spi_master.h
+++ b/components/driver/include/driver/spi_master.h
@@ -71,9 +71,9 @@ typedef struct {
     uint8_t cs_ena_pretrans;        ///< Amount of SPI bit-cycles the cs should be activated before the transmission (0-16). This only works on half-duplex transactions.
     uint8_t cs_ena_posttrans;       ///< Amount of SPI bit-cycles the cs should stay active after the transmission (0-16)
     int clock_speed_hz;             ///< Clock speed, divisors of 80MHz, in Hz. See ``SPI_MASTER_FREQ_*``.
-    int input_delay_ns;             /**< Maximum data valid time of slave. The time required between SCLK and MISO 
-        valid, including the possible clock delay from slave to master. The driver uses this value to give an extra 
-        delay before the MISO is ready on the line. Leave at 0 unless you know you need a delay. For better timing 
+    int input_delay_ns;             /**< Maximum data valid time of slave. The time required between SCLK and MISO
+        valid, including the possible clock delay from slave to master. The driver uses this value to give an extra
+        delay before the MISO is ready on the line. Leave at 0 unless you know you need a delay. For better timing
         performance at high frequency (over 8MHz), it's suggest to have the right value.
         */
     int spics_io_num;               ///< CS GPIO pin for this device, or -1 if not used
@@ -206,7 +206,10 @@ esp_err_t spi_bus_remove_device(spi_device_handle_t handle);
 
 
 /**
- * @brief Queue a SPI transaction for execution
+ * @brief Queue a SPI transaction for interrupt transaction execution. Get the result by ``spi_device_get_trans_result``.
+ *
+ * @note Normally a device cannot start (queue) polling and interrupt
+ *      transactions simultaneously.
  *
  * @param handle Device handle obtained using spi_host_add_dev
  * @param trans_desc Description of transaction to execute
@@ -216,16 +219,17 @@ esp_err_t spi_bus_remove_device(spi_device_handle_t handle);
  *         - ESP_ERR_INVALID_ARG   if parameter is invalid
  *         - ESP_ERR_TIMEOUT       if there was no room in the queue before ticks_to_wait expired
  *         - ESP_ERR_NO_MEM        if allocating DMA-capable temporary buffer failed
+ *         - ESP_ERR_INVALID_STATE if previous transactions are not finished
  *         - ESP_OK                on success
  */
 esp_err_t spi_device_queue_trans(spi_device_handle_t handle, spi_transaction_t *trans_desc, TickType_t ticks_to_wait);
 
 
 /**
- * @brief Get the result of a SPI transaction queued earlier
+ * @brief Get the result of a SPI transaction queued earlier by ``spi_device_queue_trans``.
  *
- * This routine will wait until a transaction to the given device (queued earlier with
- * spi_device_queue_trans) has succesfully completed. It will then return the description of the
+ * This routine will wait until a transaction to the given device has
+ * successfully completed. It will then return the description of the
  * completed transaction so software can inspect the result and e.g. free the memory or
  * re-use the buffers.
  *
@@ -247,10 +251,11 @@ esp_err_t spi_device_get_trans_result(spi_device_handle_t handle, spi_transactio
  * @brief Send a SPI transaction, wait for it to complete, and return the result
  *
  * This function is the equivalent of calling spi_device_queue_trans() followed by spi_device_get_trans_result().
- * Do not use this when there is still a transaction separately queued from spi_device_queue_trans() that hasn't been finalized
- * using spi_device_get_trans_result().
+ * Do not use this when there is still a transaction separately queued (started) from spi_device_queue_trans() or polling_start/transmit that hasn't been finalized.
  *
  * @note This function is not thread safe when multiple tasks access the same SPI device.
+ *      Normally a device cannot start (queue) polling and interrupt
+ *      transactions simultaneously.
  *
  * @param handle Device handle obtained using spi_host_add_dev
  * @param trans_desc Description of transaction to execute
@@ -260,6 +265,90 @@ esp_err_t spi_device_get_trans_result(spi_device_handle_t handle, spi_transactio
  */
 esp_err_t spi_device_transmit(spi_device_handle_t handle, spi_transaction_t *trans_desc);
 
+
+/**
+ * @brief Immediately start a polling transaction.
+ *
+ * @note Normally a device cannot start (queue) polling and interrupt
+ *      transactions simultaneously. Moreover, a device cannot start a new polling
+ *      transaction if another polling transaction is not finished.
+ *
+ * @param handle Device handle obtained using spi_host_add_dev
+ * @param trans_desc Description of transaction to execute
+ * @param ticks_to_wait Ticks to wait until the device gets control of the bus;
+ *              currently only portMAX_DELAY is supported.
+ *
+ * @return
+ *         - ESP_ERR_INVALID_ARG   if parameter is invalid
+ *         - ESP_ERR_TIMEOUT       if the device cannot get control of the bus before ``ticks_to_wait`` expired
+ *         - ESP_ERR_NO_MEM        if allocating DMA-capable temporary buffer failed
+ *         - ESP_ERR_INVALID_STATE if previous transactions are not finished
+ *         - ESP_OK                on success
+ */
+esp_err_t spi_device_polling_start(spi_device_handle_t handle, spi_transaction_t *trans_desc, TickType_t ticks_to_wait);
+
+
+/**
+ * @brief Poll until the polling transaction ends.
+ *
+ * This routine will not return until the transaction to the given device has
+ * successfully completed. The task is not blocked, but actively busy-spins for
+ * the transaction to be completed.
+ *
+ * @param handle Device handle obtained using spi_host_add_dev
+ * @param ticks_to_wait Ticks to wait until the transaction finishes; use portMAX_DELAY to never time
+ *                      out.
+ * @return
+ *         - ESP_ERR_INVALID_ARG   if parameter is invalid
+ *         - ESP_ERR_TIMEOUT       if the transaction cannot finish before ticks_to_wait expired
+ *         - ESP_OK                on success
+ */
+esp_err_t spi_device_polling_end(spi_device_handle_t handle, TickType_t ticks_to_wait);
+
+
+/**
+ * @brief Send a polling transaction, wait for it to complete, and return the result
+ *
+ * This function is the equivalent of calling spi_device_polling_start() followed by spi_device_polling_end().
+ * Do not use this when there is still a transaction that hasn't been finalized.
+ *
+ * @note This function is not thread safe when multiple tasks access the same SPI device.
+ *      Normally a device cannot start (queue) polling and interrupt
+ *      transactions simultaneously.
+ *
+ * @param handle Device handle obtained using spi_host_add_dev
+ * @param trans_desc Description of transaction to execute
+ * @return
+ *         - ESP_ERR_INVALID_ARG   if parameter is invalid
+ *         - ESP_OK                on success
+ */
+esp_err_t spi_device_polling_transmit(spi_device_handle_t handle, spi_transaction_t *trans_desc);
+
+
+/**
+ * @brief Occupy the SPI bus for a device to do continuous transactions.
+ *
+ * Transactions to all other devices will be put off until ``spi_device_release_bus`` is called.
+ *
+ * @note The function will wait until all the existing transactions have been sent.
+ *
+ * @param device The device to occupy the bus.
+ * @param wait Time to wait before the bus is occupied by the device. Currently MUST be set to portMAX_DELAY.
+ *
+ * @return
+ *      - ESP_ERR_INVALID_ARG : ``wait`` is not set to portMAX_DELAY.
+ *      - ESP_OK : Success.
+ */
+esp_err_t spi_device_acquire_bus(spi_device_handle_t device, TickType_t wait);
+
+/**
+ * @brief Release the SPI bus occupied by the device. All other devices can start sending transactions.
+ *
+ * @param dev The device to release the bus.
+ */
+void spi_device_release_bus(spi_device_handle_t dev);
+
+
 /**
  * @brief Calculate the working frequency that is most close to desired frequency, and also the register value.
  *
@@ -282,6 +371,7 @@ int spi_cal_clock(int fapb, int hz, int duty_cycle, uint32_t* reg_o);
   *         - -1 If too many cycles remaining, suggest to compensate half a clock.
   *         - 0 If no remaining cycles or dummy bits are not used.
   *         - positive value: cycles suggest to compensate.
+  *
   * @note If **dummy_o* is not zero, it means dummy bits should be applied in half duplex mode, and full duplex mode may not work.
   */
 void spi_get_timing(bool gpio_is_used, int input_delay_ns, int eff_clk, int* dummy_o, int* cycles_remain_o);
@@ -290,6 +380,7 @@ void spi_get_timing(bool gpio_is_used, int input_delay_ns, int eff_clk, int* dum
   * @brief Get the frequency limit of current configurations.
   *         SPI master working at this limit is OK, while above the limit, full duplex mode and DMA will not work,
   *         and dummy bits will be aplied in the half duplex mode.
+  *
   * @param gpio_is_used True if using GPIO matrix, or False if native pins are used.
   * @param input_delay_ns Input delay from SCLK launch edge to MISO data valid.
   * @return Frequency limit of current configurations.
diff --git a/components/driver/spi_common.c b/components/driver/spi_common.c
index 06873f934..cce50cb8f 100644
--- a/components/driver/spi_common.c
+++ b/components/driver/spi_common.c
@@ -31,6 +31,7 @@
 #include "driver/periph_ctrl.h"
 #include "esp_heap_caps.h"
 #include "driver/spi_common.h"
+#include "stdatomic.h"
 
 static const char *SPI_TAG = "spi";
 
@@ -50,7 +51,7 @@ typedef struct spi_device_t spi_device_t;
 #define DMA_CHANNEL_ENABLED(dma_chan)    (BIT(dma_chan-1))
 
 //Periph 1 is 'claimed' by SPI flash code.
-static bool spi_periph_claimed[3] = {true, false, false};
+static atomic_bool spi_periph_claimed[3] = { ATOMIC_VAR_INIT(true), ATOMIC_VAR_INIT(false), ATOMIC_VAR_INIT(false)};
 static uint8_t spi_dma_chan_enabled = 0;
 static portMUX_TYPE spi_dma_spinlock = portMUX_INITIALIZER_UNLOCKED;
 
@@ -58,7 +59,8 @@ static portMUX_TYPE spi_dma_spinlock = portMUX_INITIALIZER_UNLOCKED;
 //Returns true if this peripheral is successfully claimed, false if otherwise.
 bool spicommon_periph_claim(spi_host_device_t host)
 {
-    bool ret = __sync_bool_compare_and_swap(&spi_periph_claimed[host], false, true);
+    bool false_var = false;
+    bool ret = atomic_compare_exchange_strong(&spi_periph_claimed[host], &false_var, true);
     if (ret) periph_module_enable(spi_periph_signal[host].module);
     return ret;
 }
@@ -66,7 +68,8 @@ bool spicommon_periph_claim(spi_host_device_t host)
 //Returns true if this peripheral is successfully freed, false if otherwise.
 bool spicommon_periph_free(spi_host_device_t host)
 {
-    bool ret = __sync_bool_compare_and_swap(&spi_periph_claimed[host], true, false);
+    bool true_var = true;
+    bool ret = atomic_compare_exchange_strong(&spi_periph_claimed[host], &true_var, false);
     if (ret) periph_module_disable(spi_periph_signal[host].module);
     return ret;
 }
diff --git a/components/driver/spi_master.c b/components/driver/spi_master.c
index 1981ec3f2..74e4ee16b 100644
--- a/components/driver/spi_master.c
+++ b/components/driver/spi_master.c
@@ -17,22 +17,107 @@ Architecture:
 
 We can initialize a SPI driver, but we don't talk to the SPI driver itself, we address a device. A device essentially
 is a combination of SPI port and CS pin, plus some information about the specifics of communication to the device
-(timing, command/address length etc)
+(timing, command/address length etc). Arbitration between tasks is also handled per device.
 
-The essence of the interface to a device is a set of queues; one per device. The idea is that to send something to a SPI
-device, you allocate a transaction descriptor. It contains some information about the transfer like the lenghth, address,
-command etc, plus pointers to transmit and receive buffer. The address of this block gets pushed into the transmit queue.
-The SPI driver does its magic, and sends and retrieves the data eventually. The data gets written to the receive buffers,
-if needed the transaction descriptor is modified to indicate returned parameters and the entire thing goes into the return
-queue, where whatever software initiated the transaction can retrieve it.
+A device can work in interrupt mode and polling mode, and a third but
+complicated mode which combines the two modes above:
+
+1. Work in the ISR with a set of queues; one per device.
+
+   The idea is that to send something to a SPI device, you allocate a
+   transaction descriptor. It contains some information about the transfer
+   like the length, address, command etc, plus pointers to transmit and
+   receive buffer. The address of this block gets pushed into the transmit
+   queue. The SPI driver does its magic, and sends and retrieves the data
+   eventually. The data gets written to the receive buffers, if needed the
+   transaction descriptor is modified to indicate returned parameters and
+   the entire thing goes into the return queue, where whatever software
+   initiated the transaction can retrieve it.
+
+   The entire thing is run from the SPI interrupt handler. If SPI is done
+   transmitting/receiving but nothing is in the queue, it will not clear the
+   SPI interrupt but just disable it by esp_intr_disable. This way, when a
+   new thing is sent, pushing the packet into the send queue and re-enabling
+   the interrupt (by esp_intr_enable) will trigger the interrupt again, which
+   can then take care of the sending.
+
+2. Work in the polling mode in the task.
+
+   In this mode we get rid of the ISR, FreeRTOS queue and task switching; the
+   task is no longer blocked during a transaction. This increases the CPU
+   load, but decreases the interval between SPI transactions. Each time only one
+   device (in one task) can send polling transactions, transactions to
+   other devices are blocked until the polling transaction of current device
+   is done.
+
+   In the polling mode, the queue is not used, all the operations are done
+   in the task. The task calls ``spi_device_polling_start`` to set up and start
+   a new transaction, then calls ``spi_device_polling_end`` to handle the
+   return value of the transaction.
+
+   To handle the arbitration among devices, the device "temporarily" acquires
+   the bus by the ``device_acquire_bus_internal`` function, which writes
+   acquire_cs by a CAS operation. Other devices which want to send polling
+   transactions but don't own the bus will block and wait until given the
+   semaphore which indicates the ownership of the bus.
+
+   In case the ISR is still sending transactions to other devices, the ISR
+   should maintain an ``isr_free`` flag indicating that it's not doing
+   transactions. When the bus is acquired, the ISR can only send new
+   transactions to the acquiring device. The ISR will automatically disable
+   itself and send semaphore to the device if the ISR is free. If the device
+   sees the isr_free flag, it can directly start its polling transaction.
+   Otherwise it should block and wait for the semaphore from the ISR.
+
+   After the polling transaction, the driver will release the bus. During the
+   release of the bus, the driver searches all other devices to see whether
+   there is any device waiting to acquire the bus, if so, acquire for it and
+   send it a semaphore if the device queue is empty, or invoke the ISR for
+   it. If all other devices don't need to acquire the bus, but there are
+   still transactions in the queues, the ISR will also be invoked.
+
+   To get better polling efficiency, user can call ``spi_device_acquire_bus``
+   function, which also calls the ``device_acquire_bus_internal`` function,
+   before a series of polling transactions to a device. The bus acquiring and
+   task switching before and after the polling transaction will be avoided.
+
+3. Mixed mode
+
+   The driver is written under the assumption that polling and interrupt
+   transactions are not happening simultaneously. When sending polling
+   transactions, it will check whether the ISR is active, which includes the
+   case the ISR is sending the interrupt transactions of the acquiring
+   device. If the ISR is still working, the routine sending a polling
+   transaction will get blocked and wait until the semaphore from the ISR
+   which indicates the ISR is free now.
+
+   A fatal case is when a polling transaction is in flight while the ISR
+   receives an interrupt transaction. The behavior of the driver is then
+   unpredictable, so this must be strictly avoided.
+
+We have two bits to control the interrupt:
+
+1. The slave->trans_done bit, which is automatically asserted when a transaction is done.
+
+   This bit is cleared during an interrupt transaction, so that the interrupt
+   will be triggered when the transaction is done, or the SW can check the
+   bit to see if the transaction is done for polling transactions.
+
+   When no transaction is in-flight, the bit is kept active, so that the SW
+   can easily invoke the ISR by enabling the interrupt.
+
+2. The system interrupt enable/disable, controlled by esp_intr_enable and esp_intr_disable.
+
+   The interrupt is disabled (by the ISR itself) when no interrupt transaction
+   is queued. When the bus is not occupied, any task, which queues a
+   transaction into the queue, will enable the interrupt to invoke the ISR.
+   When the bus is occupied by a device, other devices will postpone
+   invoking the ISR until the bus is released. The device acquiring
+   the bus can still send interrupt transactions by enabling the
+   interrupt.
 
-The entire thing is run from the SPI interrupt handler. If SPI is done transmitting/receiving but nothing is in the queue,
-it will not clear the SPI interrupt but just disable it. This way, when a new thing is sent, pushing the packet into the send
-queue and re-enabling the interrupt will trigger the interrupt again, which can then take care of the sending.
 */
 
-
-
 #include <string.h>
 #include "driver/spi_common.h"
 #include "driver/spi_master.h"
@@ -58,6 +143,7 @@ queue and re-enabling the interrupt will trigger the interrupt again, which can
 #include "driver/gpio.h"
 #include "driver/periph_ctrl.h"
 #include "esp_heap_caps.h"
+#include "stdatomic.h"
 
 typedef struct spi_device_t spi_device_t;
 typedef typeof(SPI1.clock) spi_clock_reg_t;
@@ -80,18 +166,22 @@ typedef typeof(SPI1.clock) spi_clock_reg_t;
 /// struct to hold private transaction data (like tx and rx buffer for DMA).
 typedef struct {
     spi_transaction_t   *trans;
-    uint32_t *buffer_to_send;   //equals to tx_data, if SPI_TRANS_USE_RXDATA is applied; otherwise if original buffer wasn't in DMA-capable memory, this gets the address of a temporary buffer that is;
+    const uint32_t *buffer_to_send;   //equals to tx_data, if SPI_TRANS_USE_RXDATA is applied; otherwise if original buffer wasn't in DMA-capable memory, this gets the address of a temporary buffer that is;
                                 //otherwise sets to the original buffer or NULL if no buffer is assigned.
     uint32_t *buffer_to_rcv;    // similar to buffer_to_send
-} spi_trans_priv;
+} spi_trans_priv_t;
 
 typedef struct {
-    spi_device_t *device[NO_CS];
+    _Atomic(spi_device_t*) device[NO_CS];
     intr_handle_t intr;
     spi_dev_t *hw;
-    spi_trans_priv cur_trans_buf;
-    int cur_cs;
-    int prev_cs;
+    spi_trans_priv_t cur_trans_buf;
+    int cur_cs;     //current device doing transaction
+    int prev_cs;    //last device doing transaction, used to avoid re-configure registers if the device not changed
+    atomic_int acquire_cs; //the device acquiring the bus, NO_CS if no one is doing so.
+    bool polling;   //in process of a polling, avoid of queue new transactions into ISR
+    bool isr_free;  //the isr is not sending transactions
+    bool bus_locked;//the bus is controlled by a device
     lldesc_t *dmadesc_tx;
     lldesc_t *dmadesc_rx;
     uint32_t flags;
@@ -111,11 +201,14 @@ typedef struct {
 } clock_config_t;
 
 struct spi_device_t {
+    int id;
     QueueHandle_t trans_queue;
     QueueHandle_t ret_queue;
     spi_device_interface_config_t cfg;
     clock_config_t clk_cfg;
     spi_host_t *host;
+    SemaphoreHandle_t semphr_polling;   //semaphore to notify the device it claimed the bus
+    bool        waiting;                //the device is waiting for the exclusive control of the bus
 };
 
 static spi_host_t *spihost[3];
@@ -149,7 +242,7 @@ esp_err_t spi_bus_initialize(spi_host_device_t host, const spi_bus_config_t *bus
         dma_chan_claimed=spicommon_dma_chan_claim(dma_chan);
         if ( !dma_chan_claimed ) {
             spicommon_periph_free( host );
-            SPI_CHECK(dma_chan_claimed, "dma channel already in use", ESP_ERR_INVALID_STATE);
+            SPI_CHECK(false, "dma channel already in use", ESP_ERR_INVALID_STATE);
         }
     }
 
@@ -181,7 +274,8 @@ esp_err_t spi_bus_initialize(spi_host_device_t host, const spi_bus_config_t *bus
     } else {
         //See how many dma descriptors we need and allocate them
         int dma_desc_ct=(bus_config->max_transfer_sz+SPI_MAX_DMA_LEN-1)/SPI_MAX_DMA_LEN;
-        if (dma_desc_ct==0) dma_desc_ct=1; //default to 4k when max is not given
+        if (dma_desc_ct==0) dma_desc_ct = 1; //default to 4k when max is not given
+
         spihost[host]->max_transfer_sz = dma_desc_ct*SPI_MAX_DMA_LEN;
         spihost[host]->dmadesc_tx=heap_caps_malloc(sizeof(lldesc_t)*dma_desc_ct, MALLOC_CAP_DMA);
         spihost[host]->dmadesc_rx=heap_caps_malloc(sizeof(lldesc_t)*dma_desc_ct, MALLOC_CAP_DMA);
@@ -204,6 +298,10 @@ esp_err_t spi_bus_initialize(spi_host_device_t host, const spi_bus_config_t *bus
 
     spihost[host]->cur_cs = NO_CS;
     spihost[host]->prev_cs = NO_CS;
+    atomic_store(&spihost[host]->acquire_cs, NO_CS);
+    spihost[host]->polling = false;
+    spihost[host]->isr_free = true;
+    spihost[host]->bus_locked = false;
 
     //Reset DMA
     spihost[host]->hw->dma_conf.val|=SPI_OUT_RST|SPI_IN_RST|SPI_AHBM_RST|SPI_AHBM_FIFO_RST;
@@ -246,6 +344,7 @@ cleanup:
 #endif
     }
     free(spihost[host]);
+    spihost[host] = NULL;
     spicommon_periph_free(host);
     spicommon_dma_chan_free(dma_chan);
     return ret;
@@ -257,7 +356,7 @@ esp_err_t spi_bus_free(spi_host_device_t host)
     SPI_CHECK(host>=SPI_HOST && host<=VSPI_HOST, "invalid host", ESP_ERR_INVALID_ARG);
     SPI_CHECK(spihost[host]!=NULL, "host not in use", ESP_ERR_INVALID_STATE);
     for (x=0; x<NO_CS; x++) {
-        SPI_CHECK(spihost[host]->device[x]==NULL, "not all CSses freed", ESP_ERR_INVALID_STATE);
+        SPI_CHECK(atomic_load(&spihost[host]->device[x])==NULL, "not all CSses freed", ESP_ERR_INVALID_STATE);
     }
     spicommon_bus_free_io_cfg(&spihost[host]->bus_cfg);
 
@@ -282,7 +381,7 @@ void spi_get_timing(bool gpio_is_used, int input_delay_ns, int eff_clk, int* dum
 {
     const int apbclk_kHz = APB_CLK_FREQ/1000;
     const int apbclk_n = APB_CLK_FREQ/eff_clk;
-    const int gpio_delay_ns=(gpio_is_used?25:0);
+    const int gpio_delay_ns = gpio_is_used ? 25 : 0;
 
     //calculate how many apb clocks a period has, the 1 is to compensate in case ``input_delay_ns`` is rounded off.
     int apb_period_n = (1 + input_delay_ns + gpio_delay_ns)*apbclk_kHz/1000/1000;
@@ -305,7 +404,7 @@ void spi_get_timing(bool gpio_is_used, int input_delay_ns, int eff_clk, int* dum
 int spi_get_freq_limit(bool gpio_is_used, int input_delay_ns)
 {
     const int apbclk_kHz = APB_CLK_FREQ/1000;
-    const int gpio_delay_ns=(gpio_is_used?25:0);
+    const int gpio_delay_ns = gpio_is_used ? 25 : 0;
 
     //calculate how many apb clocks a period has, the 1 is to compensate in case ``input_delay_ns`` is rounded off.
     int apb_period_n = (1 + input_delay_ns + gpio_delay_ns)*apbclk_kHz/1000/1000;
@@ -332,7 +431,8 @@ esp_err_t spi_bus_add_device(spi_host_device_t host, const spi_device_interface_
     SPI_CHECK(dev_config->clock_speed_hz > 0, "invalid sclk speed", ESP_ERR_INVALID_ARG);
     for (freecs=0; freecs<NO_CS; freecs++) {
         //See if this slot is free; reserve if it is by putting a dummy pointer in the slot. We use an atomic compare&swap to make this thread-safe.
-        if (__sync_bool_compare_and_swap(&spihost[host]->device[freecs], NULL, (spi_device_t *)1)) break;
+        void* null=NULL;
+        if (atomic_compare_exchange_strong(&spihost[host]->device[freecs], &null, (spi_device_t *)1)) break;
     }
     SPI_CHECK(freecs!=NO_CS, "no free cs pins for host", ESP_ERR_NOT_FOUND);
     //The hardware looks like it would support this, but actually setting cs_ena_pretrans when transferring in full
@@ -340,7 +440,7 @@ esp_err_t spi_bus_add_device(spi_host_device_t host, const spi_device_interface_
     SPI_CHECK( dev_config->cs_ena_pretrans <= 1 || (dev_config->address_bits == 0 && dev_config->command_bits == 0) ||
         (dev_config->flags & SPI_DEVICE_HALFDUPLEX), "In full-duplex mode, only support cs pretrans delay = 1 and without address_bits and command_bits", ESP_ERR_INVALID_ARG);
 
-    duty_cycle = (dev_config->duty_cycle_pos==0? 128: dev_config->duty_cycle_pos);
+    duty_cycle = (dev_config->duty_cycle_pos==0) ? 128 : dev_config->duty_cycle_pos;
     eff_clk = spi_cal_clock(apbclk, dev_config->clock_speed_hz, duty_cycle, (uint32_t*)&clk_reg);
     int freq_limit = spi_get_freq_limit(!(spihost[host]->flags&SPICOMMON_BUSFLAG_NATIVE_PINS), dev_config->input_delay_ns);
     //GPIO matrix can only change data at 80Mhz rate, which only allows 40MHz SPI clock.
@@ -358,12 +458,17 @@ Specify ``SPI_DEVICE_NO_DUMMY`` to ignore this checking. Then you can output dat
     spi_device_t *dev=malloc(sizeof(spi_device_t));
     if (dev==NULL) goto nomem;
     memset(dev, 0, sizeof(spi_device_t));
-    spihost[host]->device[freecs]=dev;
+    atomic_store(&spihost[host]->device[freecs], dev);
+    dev->id = freecs;
+    dev->waiting = false;
 
     //Allocate queues, set defaults
-    dev->trans_queue=xQueueCreate(dev_config->queue_size, sizeof(spi_trans_priv));
-    dev->ret_queue=xQueueCreate(dev_config->queue_size, sizeof(spi_trans_priv));
-    if (!dev->trans_queue || !dev->ret_queue) goto nomem;
+    dev->trans_queue = xQueueCreate(dev_config->queue_size, sizeof(spi_trans_priv_t));
+    dev->ret_queue = xQueueCreate(dev_config->queue_size, sizeof(spi_trans_priv_t));
+    dev->semphr_polling = xSemaphoreCreateBinary();
+    if (!dev->trans_queue || !dev->ret_queue || !dev->semphr_polling) {
+        goto nomem;
+    }
     dev->host=spihost[host];
 
     //We want to save a copy of the dev config in the dev struct.
@@ -402,6 +507,7 @@ nomem:
     if (dev) {
         if (dev->trans_queue) vQueueDelete(dev->trans_queue);
         if (dev->ret_queue) vQueueDelete(dev->ret_queue);
+        if (dev->semphr_polling) vSemaphoreDelete(dev->semphr_polling);
     }
     free(dev);
     return ESP_ERR_NO_MEM;
@@ -414,7 +520,7 @@ esp_err_t spi_bus_remove_device(spi_device_handle_t handle)
     //These checks aren't exhaustive; another thread could sneak in a transaction inbetween. These are only here to
     //catch design errors and aren't meant to be triggered during normal operation.
     SPI_CHECK(uxQueueMessagesWaiting(handle->trans_queue)==0, "Have unfinished transactions", ESP_ERR_INVALID_STATE);
-    SPI_CHECK(handle->host->cur_cs == NO_CS || handle->host->device[handle->host->cur_cs]!=handle, "Have unfinished transactions", ESP_ERR_INVALID_STATE);
+    SPI_CHECK(handle->host->cur_cs == NO_CS || atomic_load(&handle->host->device[handle->host->cur_cs])!=handle, "Have unfinished transactions", ESP_ERR_INVALID_STATE);
     SPI_CHECK(uxQueueMessagesWaiting(handle->ret_queue)==0, "Have unfinished transactions", ESP_ERR_INVALID_STATE);
 
     //return
@@ -424,18 +530,20 @@ esp_err_t spi_bus_remove_device(spi_device_handle_t handle)
     //Kill queues
     vQueueDelete(handle->trans_queue);
     vQueueDelete(handle->ret_queue);
+    vSemaphoreDelete(handle->semphr_polling);
     //Remove device from list of csses and free memory
     for (x=0; x<NO_CS; x++) {
-        if (handle->host->device[x] == handle){
-            handle->host->device[x]=NULL;
-            if ( x == handle->host->prev_cs ) handle->host->prev_cs = NO_CS;
+        if (atomic_load(&handle->host->device[x]) == handle){
+            atomic_store(&handle->host->device[x], NULL);
+            if (x == handle->host->prev_cs) handle->host->prev_cs = NO_CS;
         }
     }
     free(handle);
     return ESP_OK;
 }
 
-static int spi_freq_for_pre_n(int fapb, int pre, int n) {
+static int spi_freq_for_pre_n(int fapb, int pre, int n)
+{
     return (fapb / (pre * n));
 }
 
@@ -491,284 +599,483 @@ int spi_cal_clock(int fapb, int hz, int duty_cycle, uint32_t *reg_o)
         reg.clkcnt_l=l-1;
         eff_clk=spi_freq_for_pre_n(fapb, pre, n);
     }
-    if ( reg_o != NULL ) *reg_o = reg.val;
+    if (reg_o != NULL) *reg_o = reg.val;
     return eff_clk;
 }
 
 /*
  * Set the spi clock according to pre-calculated register value.
  */
-static inline void spi_set_clock(spi_dev_t *hw, spi_clock_reg_t reg) {
+static inline void SPI_MASTER_ISR_ATTR spi_set_clock(spi_dev_t *hw, spi_clock_reg_t reg)
+{
     hw->clock.val = reg.val;
 }
 
-//This is run in interrupt context and apart from initialization and destruction, this is the only code
-//touching the host (=spihost[x]) variable. The rest of the data arrives in queues. That is why there are
-//no muxes in this code.
+// Setup the device-specified configuration registers. Called every time a new
+// transaction is to be sent, but only apply new configurations when the device
+// changes.
+static void SPI_MASTER_ISR_ATTR spi_setup_device(spi_host_t *host, int dev_id )
+{
+    //if the configuration is already applied, skip the following.
+    if (dev_id == host->prev_cs) {
+        return;
+    }
+
+    ESP_EARLY_LOGD(SPI_TAG, "SPI device changed from %d to %d", host->prev_cs, dev_id);
+    spi_device_t *dev = atomic_load(&host->device[dev_id]);
+    //Configure clock settings
+    spi_set_clock(host->hw, dev->clk_cfg.reg);
+    //Configure bit order
+    host->hw->ctrl.rd_bit_order=(dev->cfg.flags & SPI_DEVICE_RXBIT_LSBFIRST) ? 1 : 0;
+    host->hw->ctrl.wr_bit_order=(dev->cfg.flags & SPI_DEVICE_TXBIT_LSBFIRST) ? 1 : 0;
+
+    //Configure polarity
+    if (dev->cfg.mode==0) {
+        host->hw->pin.ck_idle_edge=0;
+        host->hw->user.ck_out_edge=0;
+    } else if (dev->cfg.mode==1) {
+        host->hw->pin.ck_idle_edge=0;
+        host->hw->user.ck_out_edge=1;
+    } else if (dev->cfg.mode==2) {
+        host->hw->pin.ck_idle_edge=1;
+        host->hw->user.ck_out_edge=1;
+    } else if (dev->cfg.mode==3) {
+        host->hw->pin.ck_idle_edge=1;
+        host->hw->user.ck_out_edge=0;
+    }
+    //Configure misc stuff
+    host->hw->user.doutdin=(dev->cfg.flags & SPI_DEVICE_HALFDUPLEX) ? 0 : 1;
+    host->hw->user.sio=(dev->cfg.flags & SPI_DEVICE_3WIRE) ? 1 : 0;
+    //Configure CS pin and timing
+    host->hw->ctrl2.setup_time=dev->cfg.cs_ena_pretrans-1;
+    host->hw->user.cs_setup=dev->cfg.cs_ena_pretrans ? 1 : 0;
+    //set hold_time to 0 will not actually append delay to CS
+    //set it to 1 since we do need at least one clock of hold time in most cases
+    host->hw->ctrl2.hold_time=dev->cfg.cs_ena_posttrans;
+    if (host->hw->ctrl2.hold_time == 0) host->hw->ctrl2.hold_time = 1;
+    host->hw->user.cs_hold=1;
+
+    host->hw->pin.cs0_dis = (dev_id == 0) ? 0 : 1;
+    host->hw->pin.cs1_dis = (dev_id == 1) ? 0 : 1;
+    host->hw->pin.cs2_dis = (dev_id == 2) ? 0 : 1;
+    //Record the device just configured to save time for next time
+    host->prev_cs = dev_id;
+}
+
+/*-----------------------------------------------------------------------------
+    Arbitration Functions
+-----------------------------------------------------------------------------*/
+
+static inline void spi_isr_invoke(spi_device_t *dev)
+{
+    int acquire_cs = atomic_load(&dev->host->acquire_cs);
+    if (acquire_cs == dev->id || acquire_cs == NO_CS) {
+        esp_intr_enable(dev->host->intr);
+    }
+    //otherwise wait for bus release to invoke
+}
+
+/*  Race with the other devices for ownership of the bus. Only wait == portMAX_DELAY is supported for now.
+ *  Even if this returns successfully, the ISR may still be running.
+ *  Call device_wait_for_isr_idle to make sure the ISR is done.
+ */
+static SPI_MASTER_ISR_ATTR esp_err_t device_acquire_bus_internal(spi_device_t *handle, TickType_t wait)
+{
+    spi_host_t *host = handle->host;
+    SPI_CHECK(wait==portMAX_DELAY, "acquire finite time not supported now.", ESP_ERR_INVALID_ARG);
+
+    if (atomic_load(&host->acquire_cs) == handle->id) {
+        // Quickly skip if the bus is already acquired.
+        // Usually this is only when the bus is locked.
+        assert(host->bus_locked);
+        return ESP_OK;
+    } else {
+        // Declare that we are waiting for the bus, so that if we get blocked later, the other device or the ISR will yield to us when they are done.
+        handle->waiting = true;
+        // Clear the semaphore before checking
+        xSemaphoreTake(handle->semphr_polling, 0);
+
+        int no_cs = NO_CS;
+        atomic_compare_exchange_strong(&host->acquire_cs, &no_cs, handle->id); // strong: a spurious failure on a free bus would block us below with nobody to wake us
+        if (atomic_load(&host->acquire_cs) != handle->id) {
+            //block until the bus is handed over to us (with the help of another task or the ISR)
+            BaseType_t ret = xSemaphoreTake(handle->semphr_polling, wait);
+            //TODO: add timeout handling here (note that handle->waiting is still set when returning on this path).
+            if (ret == pdFALSE) return ESP_ERR_TIMEOUT;
+        }
+        handle->waiting = false;
+    }
+    return ESP_OK;
+}
+
+/*  This function checks whether the ISR is done; if not, it blocks until the semaphore is given.
+ *  Returns ESP_ERR_TIMEOUT if the semaphore could not be taken within ``wait``, ESP_OK otherwise. */
+static inline esp_err_t device_wait_for_isr_idle(spi_device_t *handle, TickType_t wait)
+{
+    //quickly skip if the isr is already free
+    if (!handle->host->isr_free) {
+        // Clear the semaphore before checking again, so that a stale give cannot satisfy the blocking take below
+        xSemaphoreTake(handle->semphr_polling, 0);
+        if (!handle->host->isr_free) {
+            //block until the ISR is free and gives us the semaphore
+            BaseType_t ret = xSemaphoreTake(handle->semphr_polling, wait);
+            //TODO: add timeout handling here.
+            if (ret == pdFALSE) return ESP_ERR_TIMEOUT;
+        }
+    }
+    return ESP_OK;
+}
+
+/*  This function releases the bus acquired by device_acquire_bus_internal.
+    It also tries to help another waiting device to acquire the bus.
+    If no device is waiting for the bus, it goes through all device queues to see whether to invoke the ISR.
+ */
+static SPI_MASTER_ISR_ATTR void device_release_bus_internal(spi_host_t *host)
+{
+    //release the bus
+    atomic_store(&host->acquire_cs, NO_CS);
+    //first try to hand the bus over to a device that is waiting to acquire it
+    for (int i = 0; i < NO_CS; i++) {
+        //search through all registered devices
+        spi_device_t* dev = atomic_load(&host->device[i]);
+        if (dev && dev->waiting) {
+            int no_cs = NO_CS;
+            atomic_compare_exchange_strong(&host->acquire_cs, &no_cs, i); // strong: a spurious failure would leave the bus free and the waiter never woken
+            if (atomic_load(&host->acquire_cs) == i) {
+                // Succeeded in acquiring the bus for the new device
+                BaseType_t ret = uxQueueMessagesWaiting(dev->trans_queue);
+                if (ret > 0) {
+                    // If there are transactions in the queue, the ISR should be invoked first
+                    // Resume the interrupt to send the task a signal
+                    spi_isr_invoke(dev);
+                } else {
+                    // Otherwise resume the task directly.
+                    xSemaphoreGive(dev->semphr_polling);
+                }
+            }
+            return;
+        }
+    }
+    //if no device is waiting, search the device queues to see whether the ISR needs to be resumed
+    for( int i = 0; i < NO_CS; i++) {
+        spi_device_t *dev = atomic_load(&host->device[i]);
+        if (dev == NULL) continue;
+        BaseType_t ret = uxQueueMessagesWaiting(dev->trans_queue);
+        if ( ret != 0) {
+            spi_isr_invoke(dev);
+            return;
+        }
+    }
+}
+
+static inline bool device_is_polling(spi_device_t *handle)
+{
+    return atomic_load(&handle->host->acquire_cs) == handle->id && handle->host->polling;
+}
+
+/*-----------------------------------------------------------------------------
+    Working Functions
+-----------------------------------------------------------------------------*/
+
+// The function is called to send a new transaction, in ISR or in the task.
+// Setup the transaction-specified registers and linked-list used by the DMA (or FIFO if DMA is not used)
+static void SPI_MASTER_ISR_ATTR spi_new_trans(spi_device_t *dev, spi_trans_priv_t *trans_buf)
+{
+    spi_transaction_t *trans = NULL;
+    spi_host_t *host = dev->host;
+    int dev_id = dev->id;
+
+    //clear int bit
+    host->hw->slave.trans_done = 0;
+
+    trans = trans_buf->trans;
+    host->cur_cs = dev_id;
+    //We should be done with the transmission.
+    assert(host->hw->cmd.usr == 0);
+
+    //Reconfigure according to device settings, the function only has effect when the dev_id is changed.
+    spi_setup_device(host, dev_id);
+
+    //Reset DMA peripheral
+    host->hw->dma_conf.val |= SPI_OUT_RST|SPI_IN_RST|SPI_AHBM_RST|SPI_AHBM_FIFO_RST;
+    host->hw->dma_out_link.start=0;
+    host->hw->dma_in_link.start=0;
+    host->hw->dma_conf.val &= ~(SPI_OUT_RST|SPI_IN_RST|SPI_AHBM_RST|SPI_AHBM_FIFO_RST);
+    host->hw->dma_conf.out_data_burst_en=1;
+    host->hw->dma_conf.indscr_burst_en=1;
+    host->hw->dma_conf.outdscr_burst_en=1;
+    //Set up QIO/DIO if needed
+    host->hw->ctrl.val &= ~(SPI_FREAD_DUAL|SPI_FREAD_QUAD|SPI_FREAD_DIO|SPI_FREAD_QIO);
+    host->hw->user.val &= ~(SPI_FWRITE_DUAL|SPI_FWRITE_QUAD|SPI_FWRITE_DIO|SPI_FWRITE_QIO);
+    if (trans->flags & SPI_TRANS_MODE_DIO) {
+        if (trans->flags & SPI_TRANS_MODE_DIOQIO_ADDR) {
+            host->hw->ctrl.fread_dio=1;
+            host->hw->user.fwrite_dio=1;
+        } else {
+            host->hw->ctrl.fread_dual=1;
+            host->hw->user.fwrite_dual=1;
+        }
+        host->hw->ctrl.fastrd_mode=1;
+    } else if (trans->flags & SPI_TRANS_MODE_QIO) {
+        if (trans->flags & SPI_TRANS_MODE_DIOQIO_ADDR) {
+            host->hw->ctrl.fread_qio=1;
+            host->hw->user.fwrite_qio=1;
+        } else {
+            host->hw->ctrl.fread_quad=1;
+            host->hw->user.fwrite_quad=1;
+        }
+        host->hw->ctrl.fastrd_mode=1;
+    }
+
+    //Fill DMA descriptors
+    int extra_dummy=0;
+    if (trans_buf->buffer_to_rcv) {
+        if (host->dma_chan == 0) {
+            //No need to setup anything; we'll copy the result out of the work registers directly later.
+        } else {
+            spicommon_setup_dma_desc_links(host->dmadesc_rx, ((trans->rxlength+7)/8), (uint8_t*)trans_buf->buffer_to_rcv, true);
+            host->hw->dma_in_link.addr=(int)(&host->dmadesc_rx[0]) & 0xFFFFF;
+            host->hw->dma_in_link.start=1;
+        }
+        //when no_dummy is not set and in half-duplex mode, sets the dummy bit if RX phase exist
+        if (((dev->cfg.flags&SPI_DEVICE_NO_DUMMY)==0) && (dev->cfg.flags&SPI_DEVICE_HALFDUPLEX)) {
+            extra_dummy=dev->clk_cfg.dummy_num;
+        }
+    } else {
+        //DMA temporary workaround: let RX DMA work somehow to avoid the issue in ESP32 v0/v1 silicon
+        if (host->dma_chan != 0 ) {
+            host->hw->dma_in_link.addr=0;
+            host->hw->dma_in_link.start=1;
+        }
+    }
+
+    if (trans_buf->buffer_to_send) {
+        if (host->dma_chan == 0) {
+            //Need to copy data to registers manually
+            for (int x=0; x < trans->length; x+=32) {
+                //Use memcpy to get around alignment issues for txdata
+                uint32_t word;
+                memcpy(&word, &trans_buf->buffer_to_send[x/32], 4);
+                host->hw->data_buf[(x/32)]=word;
+            }
+        } else {
+            spicommon_setup_dma_desc_links(host->dmadesc_tx, (trans->length+7)/8, (uint8_t*)trans_buf->buffer_to_send, false);
+            host->hw->dma_out_link.addr=(int)(&host->dmadesc_tx[0]) & 0xFFFFF;
+            host->hw->dma_out_link.start=1;
+        }
+    }
+
+    //SPI iface needs to be configured for a delay in some cases.
+    //configure dummy bits
+    host->hw->user.usr_dummy=(dev->cfg.dummy_bits+extra_dummy) ? 1 : 0;
+    host->hw->user1.usr_dummy_cyclelen=dev->cfg.dummy_bits+extra_dummy-1;
+
+    int miso_long_delay = 0;
+    if (dev->clk_cfg.miso_delay<0) {
+        //if the data comes too late, delay half a SPI clock to improve reading
+        miso_long_delay = 1;
+        host->hw->ctrl2.miso_delay_num = 0;
+    } else {
+        //if the data is so fast that dummy_bit is used, delay some apb clocks to meet the timing
+        host->hw->ctrl2.miso_delay_num = extra_dummy ? dev->clk_cfg.miso_delay : 0;
+    }
+
+    if (miso_long_delay) {
+        switch (dev->cfg.mode) {
+        case 0:
+            host->hw->ctrl2.miso_delay_mode = 2;
+            break;
+        case 1:
+            host->hw->ctrl2.miso_delay_mode = 1;
+            break;
+        case 2:
+            host->hw->ctrl2.miso_delay_mode = 1;
+            break;
+        case 3:
+            host->hw->ctrl2.miso_delay_mode = 2;
+            break;
+        }
+    } else {
+        host->hw->ctrl2.miso_delay_mode = 0;
+    }
+
+    host->hw->mosi_dlen.usr_mosi_dbitlen=trans->length-1;
+    if ( dev->cfg.flags & SPI_DEVICE_HALFDUPLEX ) {
+        host->hw->miso_dlen.usr_miso_dbitlen=trans->rxlength-1;
+    } else {
+        //rxlength is not used in full-duplex mode
+        host->hw->miso_dlen.usr_miso_dbitlen=trans->length-1;
+    }
+
+    //Configure bit sizes, load addr and command
+    int cmdlen;
+    int addrlen;
+    if (!(dev->cfg.flags & SPI_DEVICE_HALFDUPLEX) && dev->cfg.cs_ena_pretrans != 0) {
+        /* The command and address phase is not compatible with cs_ena_pretrans
+         * in full duplex mode.
+         */
+        cmdlen = 0;
+        addrlen = 0;
+    } else {
+        if (trans->flags & SPI_TRANS_VARIABLE_CMD) {
+            cmdlen = ((spi_transaction_ext_t *)trans)->command_bits;
+        } else {
+            cmdlen = dev->cfg.command_bits;
+        }
+        if (trans->flags & SPI_TRANS_VARIABLE_ADDR) {
+            addrlen = ((spi_transaction_ext_t *)trans)->address_bits;
+        } else {
+            addrlen = dev->cfg.address_bits;
+        }
+    }
+
+    host->hw->user1.usr_addr_bitlen=addrlen-1;
+    host->hw->user2.usr_command_bitlen=cmdlen-1;
+    host->hw->user.usr_addr=addrlen ? 1 : 0;
+    host->hw->user.usr_command=cmdlen ? 1 : 0;
+
+    /* Output command will be sent from bit 7 to 0 of command_value, and
+     * then bit 15 to 8 of the same register field. Shift and swap to send
+     * more straightly.
+     */
+    host->hw->user2.usr_command_value = SPI_SWAP_DATA_TX(trans->cmd, cmdlen);
+
+    // shift the address to MSB of addr (and maybe slv_wr_status) register.
+    // output address will be sent from MSB to LSB of addr register, then comes the MSB to LSB of slv_wr_status register.
+    if (addrlen>32) {
+        host->hw->addr = trans->addr >> (addrlen- 32);
+        host->hw->slv_wr_status = trans->addr << (64 - addrlen);
+    } else {
+        host->hw->addr = trans->addr << (32 - addrlen);
+    }
+
+    if ((!(dev->cfg.flags & SPI_DEVICE_HALFDUPLEX) && trans_buf->buffer_to_rcv) ||
+        trans_buf->buffer_to_send) {
+        host->hw->user.usr_mosi = 1;
+    } else {
+        host->hw->user.usr_mosi = 0;
+    }
+    host->hw->user.usr_miso = (trans_buf->buffer_to_rcv) ? 1 : 0;
+
+    //Call pre-transmission callback, if any
+    if (dev->cfg.pre_cb) dev->cfg.pre_cb(trans);
+    //Kick off transfer
+    host->hw->cmd.usr=1;
+}
+
+// The function is called when a transaction is done, in ISR or in the task.
+// Fetch the data from FIFO and call the ``post_cb``.
+static void SPI_MASTER_ISR_ATTR spi_post_trans(spi_host_t *host)
+{
+    spi_transaction_t *cur_trans = host->cur_trans_buf.trans;
+    if (host->cur_trans_buf.buffer_to_rcv && host->dma_chan == 0 ) {
+        //Need to copy from SPI regs to result buffer.
+        for (int x = 0; x < cur_trans->rxlength; x += 32) {
+            //Do a memcpy to get around possible alignment issues in rx_buffer
+            uint32_t word = host->hw->data_buf[x / 32];
+            int len = cur_trans->rxlength - x;
+            if (len > 32) len = 32;
+            memcpy(&host->cur_trans_buf.buffer_to_rcv[x / 32], &word, (len + 7) / 8);
+        }
+    }
+    //Call post-transaction callback, if any
+    spi_device_t* dev = atomic_load(&host->device[host->cur_cs]);
+    if (dev->cfg.post_cb) dev->cfg.post_cb(cur_trans);
+
+    host->cur_cs = NO_CS;
+}
+
+// This is run in interrupt context.
 static void SPI_MASTER_ISR_ATTR spi_intr(void *arg)
 {
     int i;
     BaseType_t r;
-    BaseType_t do_yield=pdFALSE;
-    spi_trans_priv *trans_buf=NULL;
-    spi_transaction_t *trans=NULL;
-    spi_host_t *host=(spi_host_t*)arg;
+    BaseType_t do_yield = pdFALSE;
+    spi_host_t *host = (spi_host_t *)arg;
 
-    //Ignore all but the trans_done int.
-    if (!host->hw->slave.trans_done) return;
+    assert(host->hw->slave.trans_done == 1);
 
     /*------------ deal with the in-flight transaction -----------------*/
     if (host->cur_cs != NO_CS) {
-        spi_transaction_t *cur_trans = host->cur_trans_buf.trans;
         //Okay, transaction is done.
-        if (host->cur_trans_buf.buffer_to_rcv && host->dma_chan == 0 ) {
-            //Need to copy from SPI regs to result buffer.
-            for (int x=0; x < cur_trans->rxlength; x+=32) {
-                //Do a memcpy to get around possible alignment issues in rx_buffer
-                uint32_t word=host->hw->data_buf[x/32];
-                int len=cur_trans->rxlength-x;
-                if (len>32) len=32;
-                memcpy(&host->cur_trans_buf.buffer_to_rcv[x/32], &word, (len+7)/8);
-            }
+        const int cs = host->cur_cs;
+        //Tell common code DMA workaround that our DMA channel is idle. If needed, the code will do a DMA reset.
+        if (host->dma_chan) {
+            spicommon_dmaworkaround_idle(host->dma_chan);
         }
-        //Call post-transaction callback, if any
-        if (host->device[host->cur_cs]->cfg.post_cb) host->device[host->cur_cs]->cfg.post_cb(cur_trans);
-        //Return transaction descriptor.
-        xQueueSendFromISR(host->device[host->cur_cs]->ret_queue, &host->cur_trans_buf, &do_yield);
-        host->cur_cs = NO_CS;
-    }
-    //Tell common code DMA workaround that our DMA channel is idle. If needed, the code will do a DMA reset.
-    if (host->dma_chan) spicommon_dmaworkaround_idle(host->dma_chan);
 
-    /*------------ new transaction starts here ------------------*/
-    //ToDo: This is a stupidly simple low-cs-first priority scheme. Make this configurable somehow. - JD
-    for (i=0; i<NO_CS; i++) {
-        if (host->device[i]) {
-            r=xQueueReceiveFromISR(host->device[i]->trans_queue, &host->cur_trans_buf, &do_yield);
-            trans_buf = &host->cur_trans_buf;
-            //Stop looking if we have a transaction to send.
-            if (r) break;
-        }
-    }
-    if (i==NO_CS) {
-        //No packet waiting. Disable interrupt.
-        esp_intr_disable(host->intr);
+        //cur_cs is changed to NO_CS here
+        spi_post_trans(host);
+        //Return transaction descriptor.
+        xQueueSendFromISR(atomic_load(&host->device[cs])->ret_queue, &host->cur_trans_buf, &do_yield);
 #ifdef CONFIG_PM_ENABLE
         //Release APB frequency lock
         esp_pm_lock_release(host->pm_lock);
 #endif
+    }
+
+    /*------------ new transaction starts here ------------------*/
+    assert(host->cur_cs == NO_CS);
+
+    // Clear isr_free before checking acquire_cs so that the task will always block if we find the bus is not acquired.
+    // There is a small possibility that the task is blocked but we find the bus is acquired.
+    host->isr_free = false;
+
+    /* When the bus is not occupied, the task uses esp_intr_enable to inform the ISR that there is a new transaction.
+     * If the queue is empty, we disable the system interrupt.
+     * We disable it first, to avoid the conflict when the task enables and the ISR disables it at the same time.
+     * If a transaction is sent (queue not empty), we will re-enable it (see below).
+     */
+    esp_intr_disable( host->intr );
+    int acquire_cs = atomic_load(&host->acquire_cs);
+    if (acquire_cs != NO_CS) {
+        // Only look in the queue of the occupying device.
+        i = acquire_cs;
+        spi_device_t* dev = atomic_load(&host->device[i]);
+        assert(dev);
+        r = xQueueReceiveFromISR(dev->trans_queue, &host->cur_trans_buf, &do_yield);
+        // If the Queue is empty, skip the sending by setting i=NO_CS
+        // Otherwise i is kept as is and the transaction will be sent.
+        if (!r) {
+            // Set the free to true before resume the task
+            host->isr_free = true;
+            xSemaphoreGiveFromISR(dev->semphr_polling, &do_yield);
+            i = NO_CS;
+        }
     } else {
-        host->hw->slave.trans_done=0; //clear int bit
-        //We have a transaction. Send it.
-        spi_device_t *dev=host->device[i];
-        trans = trans_buf->trans;
-        host->cur_cs=i;
-        //We should be done with the transmission.
-        assert(host->hw->cmd.usr == 0);
-
-        //Reconfigure according to device settings, but only if we change CSses.
-        if (i!=host->prev_cs) {
-            spi_set_clock(host->hw, dev->clk_cfg.reg);
-            //Configure bit order
-            host->hw->ctrl.rd_bit_order=(dev->cfg.flags & SPI_DEVICE_RXBIT_LSBFIRST)?1:0;
-            host->hw->ctrl.wr_bit_order=(dev->cfg.flags & SPI_DEVICE_TXBIT_LSBFIRST)?1:0;
-
-            //Configure polarity
-            if (dev->cfg.mode==0) {
-                host->hw->pin.ck_idle_edge=0;
-                host->hw->user.ck_out_edge=0;
-            } else if (dev->cfg.mode==1) {
-                host->hw->pin.ck_idle_edge=0;
-                host->hw->user.ck_out_edge=1;
-            } else if (dev->cfg.mode==2) {
-                host->hw->pin.ck_idle_edge=1;
-                host->hw->user.ck_out_edge=1;
-            } else if (dev->cfg.mode==3) {
-                host->hw->pin.ck_idle_edge=1;
-                host->hw->user.ck_out_edge=0;
-            }
-            //Configure misc stuff
-            host->hw->user.doutdin=(dev->cfg.flags & SPI_DEVICE_HALFDUPLEX)?0:1;
-            host->hw->user.sio=(dev->cfg.flags & SPI_DEVICE_3WIRE)?1:0;
-
-            host->hw->ctrl2.setup_time=dev->cfg.cs_ena_pretrans-1;
-            host->hw->user.cs_setup=dev->cfg.cs_ena_pretrans?1:0;
-            //set hold_time to 0 will not actually append delay to CS
-            //set it to 1 since we do need at least one clock of hold time in most cases
-            host->hw->ctrl2.hold_time=dev->cfg.cs_ena_posttrans;
-            if ( host->hw->ctrl2.hold_time == 0 ) host->hw->ctrl2.hold_time = 1;
-            host->hw->user.cs_hold=1;
-
-            //Configure CS pin
-            host->hw->pin.cs0_dis=(i==0)?0:1;
-            host->hw->pin.cs1_dis=(i==1)?0:1;
-            host->hw->pin.cs2_dis=(i==2)?0:1;
-        }
-        host->prev_cs = i;
-        //Reset SPI peripheral
-        host->hw->dma_conf.val |= SPI_OUT_RST|SPI_IN_RST|SPI_AHBM_RST|SPI_AHBM_FIFO_RST;
-        host->hw->dma_out_link.start=0;
-        host->hw->dma_in_link.start=0;
-        host->hw->dma_conf.val &= ~(SPI_OUT_RST|SPI_IN_RST|SPI_AHBM_RST|SPI_AHBM_FIFO_RST);
-        host->hw->dma_conf.out_data_burst_en=1;
-        host->hw->dma_conf.indscr_burst_en=1;
-        host->hw->dma_conf.outdscr_burst_en=1;
-        //Set up QIO/DIO if needed
-        host->hw->ctrl.val &= ~(SPI_FREAD_DUAL|SPI_FREAD_QUAD|SPI_FREAD_DIO|SPI_FREAD_QIO);
-        host->hw->user.val &= ~(SPI_FWRITE_DUAL|SPI_FWRITE_QUAD|SPI_FWRITE_DIO|SPI_FWRITE_QIO);
-        if (trans->flags & SPI_TRANS_MODE_DIO) {
-            if (trans->flags & SPI_TRANS_MODE_DIOQIO_ADDR) {
-                host->hw->ctrl.fread_dio=1;
-                host->hw->user.fwrite_dio=1;
-            } else {
-                host->hw->ctrl.fread_dual=1;
-                host->hw->user.fwrite_dual=1;
-            }
-            host->hw->ctrl.fastrd_mode=1;
-        } else if (trans->flags & SPI_TRANS_MODE_QIO) {
-            if (trans->flags & SPI_TRANS_MODE_DIOQIO_ADDR) {
-                host->hw->ctrl.fread_qio=1;
-                host->hw->user.fwrite_qio=1;
-            } else {
-                host->hw->ctrl.fread_quad=1;
-                host->hw->user.fwrite_quad=1;
-            }
-            host->hw->ctrl.fastrd_mode=1;
-        }
-
-        //Fill DMA descriptors
-        int extra_dummy=0;
-        if (trans_buf->buffer_to_rcv) {
-            if (host->dma_chan == 0) {
-                //No need to setup anything; we'll copy the result out of the work registers directly later.
-            } else {
-                spicommon_dmaworkaround_transfer_active(host->dma_chan); //mark channel as active
-                spicommon_setup_dma_desc_links(host->dmadesc_rx, ((trans->rxlength+7)/8), (uint8_t*)trans_buf->buffer_to_rcv, true);
-                host->hw->dma_in_link.addr=(int)(&host->dmadesc_rx[0]) & 0xFFFFF;
-                host->hw->dma_in_link.start=1;
-            }
-            //when no_dummy is not set and in half-duplex mode, sets the dummy bit if RX phase exist
-            if (((dev->cfg.flags&SPI_DEVICE_NO_DUMMY)==0) && (dev->cfg.flags&SPI_DEVICE_HALFDUPLEX)) {
-                extra_dummy=dev->clk_cfg.dummy_num;
-            }
-        } else {
-            //DMA temporary workaround: let RX DMA work somehow to avoid the issue in ESP32 v0/v1 silicon
-            if (host->dma_chan != 0 ) {
-                host->hw->dma_in_link.addr=0;
-                host->hw->dma_in_link.start=1;
+        //Go through all device queues to find a transaction to send
+        //ToDo: This is a stupidly simple low-cs-first priority scheme. Make this configurable somehow. - JD
+        for (i = 0; i < NO_CS; i++) {
+            spi_device_t* dev = atomic_load(&host->device[i]);
+            if (dev) {
+                r = xQueueReceiveFromISR(dev->trans_queue, &host->cur_trans_buf, &do_yield);
+                //Stop looking if we have a transaction to send.
+                if (r) break;
             }
         }
-
-
-        if (trans_buf->buffer_to_send) {
-            if (host->dma_chan == 0) {
-                //Need to copy data to registers manually
-                for (int x=0; x < trans->length; x+=32) {
-                    //Use memcpy to get around alignment issues for txdata
-                    uint32_t word;
-                    memcpy(&word, &trans_buf->buffer_to_send[x/32], 4);
-                    host->hw->data_buf[(x/32)]=word;
-                }
-            } else {
-                spicommon_dmaworkaround_transfer_active(host->dma_chan); //mark channel as active
-                spicommon_setup_dma_desc_links(host->dmadesc_tx, (trans->length+7)/8, (uint8_t*)trans_buf->buffer_to_send, false);
-                host->hw->dma_out_link.addr=(int)(&host->dmadesc_tx[0]) & 0xFFFFF;
-                host->hw->dma_out_link.start=1;
-            }
+        if (i==NO_CS) {
+            host->isr_free = true;
         }
+    }
 
-        //SPI iface needs to be configured for a delay in some cases.
-        //configure dummy bits
-        host->hw->user.usr_dummy=(dev->cfg.dummy_bits+extra_dummy)?1:0;
-        host->hw->user1.usr_dummy_cyclelen=dev->cfg.dummy_bits+extra_dummy-1;
-
-        int miso_long_delay = 0;
-        if (dev->clk_cfg.miso_delay<0) {
-            //if the data comes too late, delay half a SPI clock to improve reading
-            miso_long_delay = 1;
-            host->hw->ctrl2.miso_delay_num = 0;
-        } else {
-            //if the data is so fast that dummy_bit is used, delay some apb clocks to meet the timing
-            host->hw->ctrl2.miso_delay_num = (extra_dummy? dev->clk_cfg.miso_delay: 0);
+    // Actually send the transaction
+    if (i != NO_CS) {
+        spi_trans_priv_t *const cur_trans_buf = &host->cur_trans_buf;
+        if (host->dma_chan != 0 && (cur_trans_buf->buffer_to_rcv || cur_trans_buf->buffer_to_send)) {
+            //mark channel as active, so that the DMA will not be reset by the slave
+            spicommon_dmaworkaround_transfer_active(host->dma_chan);
         }
-
-        if (dev->cfg.mode==0) {
-            host->hw->ctrl2.miso_delay_mode=miso_long_delay?2:0;
-        } else if (dev->cfg.mode==1) {
-            host->hw->ctrl2.miso_delay_mode=miso_long_delay?1:0;
-        } else if (dev->cfg.mode==2) {
-            host->hw->ctrl2.miso_delay_mode=miso_long_delay?1:0;
-        } else if (dev->cfg.mode==3) {
-            host->hw->ctrl2.miso_delay_mode=miso_long_delay?2:0;
-        }
-
-        host->hw->mosi_dlen.usr_mosi_dbitlen=trans->length-1;
-        if ( dev->cfg.flags & SPI_DEVICE_HALFDUPLEX ) {
-            host->hw->miso_dlen.usr_miso_dbitlen=trans->rxlength-1;
-        } else {
-            //rxlength is not used in full-duplex mode
-            host->hw->miso_dlen.usr_miso_dbitlen=trans->length-1;
-        }
-
-        //Configure bit sizes, load addr and command
-        int cmdlen;
-        int addrlen;
-        if (!(dev->cfg.flags & SPI_DEVICE_HALFDUPLEX) && dev->cfg.cs_ena_pretrans != 0) {
-            /* The command and address phase is not compatible with cs_ena_pretrans
-             * in full duplex mode.
-             */
-            cmdlen = 0;
-            addrlen = 0;
-        } else {
-            if (trans->flags & SPI_TRANS_VARIABLE_CMD) {
-                cmdlen = ((spi_transaction_ext_t *)trans)->command_bits;
-            } else {
-                cmdlen = dev->cfg.command_bits;
-            }
-            if (trans->flags & SPI_TRANS_VARIABLE_ADDR) {
-                addrlen = ((spi_transaction_ext_t *)trans)->address_bits;
-            } else {
-                addrlen = dev->cfg.address_bits;
-            }
-        }
-
-        host->hw->user1.usr_addr_bitlen=addrlen-1;
-        host->hw->user2.usr_command_bitlen=cmdlen-1;
-        host->hw->user.usr_addr=addrlen?1:0;
-        host->hw->user.usr_command=cmdlen?1:0;
-
-        /* Output command will be sent from bit 7 to 0 of command_value, and
-         * then bit 15 to 8 of the same register field. Shift and swap to send
-         * more straightly.
-         */
-        host->hw->user2.usr_command_value = SPI_SWAP_DATA_TX(trans->cmd, cmdlen);
-
-        // shift the address to MSB of addr (and maybe slv_wr_status) register.
-        // output address will be sent from MSB to LSB of addr register, then comes the MSB to LSB of slv_wr_status register.
-        if (addrlen>32) {
-            host->hw->addr = trans->addr >> (addrlen- 32);
-            host->hw->slv_wr_status = trans->addr << (64 - addrlen);
-        } else {
-            host->hw->addr = trans->addr << (32 - addrlen);
-        }
-
-        host->hw->user.usr_mosi=( (!(dev->cfg.flags & SPI_DEVICE_HALFDUPLEX) && trans_buf->buffer_to_rcv) || trans_buf->buffer_to_send)?1:0;
-        host->hw->user.usr_miso=(trans_buf->buffer_to_rcv)?1:0;
-
-        //Call pre-transmission callback, if any
-        if (dev->cfg.pre_cb) dev->cfg.pre_cb(trans);
-        //Kick off transfer
-        host->hw->cmd.usr=1;
+        spi_new_trans(atomic_load(&host->device[i]), cur_trans_buf);
+        //re-enable interrupt disabled above
+        esp_intr_enable(host->intr);
     }
     if (do_yield) portYIELD_FROM_ISR();
 }
 
-
-esp_err_t SPI_MASTER_ATTR spi_device_queue_trans(spi_device_handle_t handle, spi_transaction_t *trans_desc,  TickType_t ticks_to_wait)
+static esp_err_t check_trans_valid(spi_device_handle_t handle, spi_transaction_t *trans_desc)
 {
-    esp_err_t ret = ESP_OK;
-    BaseType_t r;
     SPI_CHECK(handle!=NULL, "invalid dev handle", ESP_ERR_INVALID_ARG);
+    spi_host_t *host = handle->host;
     //check transmission length
     SPI_CHECK((trans_desc->flags & SPI_TRANS_USE_RXDATA)==0 ||trans_desc->rxlength <= 32, "rxdata transfer > 32 bits without configured DMA", ESP_ERR_INVALID_ARG);
     SPI_CHECK((trans_desc->flags & SPI_TRANS_USE_TXDATA)==0 ||trans_desc->length <= 32, "txdata transfer > 32 bits without configured DMA", ESP_ERR_INVALID_ARG);
@@ -778,7 +1085,7 @@ esp_err_t SPI_MASTER_ATTR spi_device_queue_trans(spi_device_handle_t handle, spi
     //check working mode
     SPI_CHECK(!((trans_desc->flags & (SPI_TRANS_MODE_DIO|SPI_TRANS_MODE_QIO)) && (handle->cfg.flags & SPI_DEVICE_3WIRE)), "incompatible iface params", ESP_ERR_INVALID_ARG);
     SPI_CHECK(!((trans_desc->flags & (SPI_TRANS_MODE_DIO|SPI_TRANS_MODE_QIO)) && (!(handle->cfg.flags & SPI_DEVICE_HALFDUPLEX))), "incompatible iface params", ESP_ERR_INVALID_ARG);
-    SPI_CHECK( !(handle->cfg.flags & SPI_DEVICE_HALFDUPLEX) || handle->host->dma_chan == 0 || !(trans_desc->flags & SPI_TRANS_USE_RXDATA || trans_desc->rx_buffer != NULL)
+    SPI_CHECK( !(handle->cfg.flags & SPI_DEVICE_HALFDUPLEX) || host->dma_chan == 0 || !(trans_desc->flags & SPI_TRANS_USE_RXDATA || trans_desc->rx_buffer != NULL)
         || !(trans_desc->flags & SPI_TRANS_USE_TXDATA || trans_desc->tx_buffer!=NULL), "SPI half duplex mode does not support using DMA with both MOSI and MISO phases.", ESP_ERR_INVALID_ARG );
     //In Full duplex mode, default rxlength to be the same as length, if not filled in.
     // set rxlength to length is ok, even when rx buffer=NULL
@@ -786,106 +1093,127 @@ esp_err_t SPI_MASTER_ATTR spi_device_queue_trans(spi_device_handle_t handle, spi
         trans_desc->rxlength=trans_desc->length;
     }
 
-    spi_trans_priv trans_buf;
-    memset( &trans_buf, 0, sizeof(spi_trans_priv) );
-    trans_buf.trans = trans_desc;
+    return ESP_OK;
+}
+
+static SPI_MASTER_ATTR void uninstall_priv_desc(spi_trans_priv_t* trans_buf)
+{
+    //Free the temporary buffers referenced by trans_buf; data in a temporary RX buffer is copied back to the caller first.
+    spi_transaction_t *trans_desc = trans_buf->trans;
+    if ((void *)trans_buf->buffer_to_send != &trans_desc->tx_data[0] && trans_buf->buffer_to_send != trans_desc->tx_buffer) {
+        free((void *)trans_buf->buffer_to_send); //force free, ignore const
+    }
+    //copy data from temporary DMA-capable buffer back to the caller's buffer and free the temporary one.
+    //buffer_to_rcv is still NULL when setup_priv_desc() fails before assigning it: nothing to copy or free then.
+    if (trans_buf->buffer_to_rcv != NULL && (void *)trans_buf->buffer_to_rcv != &trans_desc->rx_data[0] && trans_buf->buffer_to_rcv != trans_desc->rx_buffer) {
+        if (trans_desc->flags & SPI_TRANS_USE_RXDATA) {
+            memcpy((uint8_t *) & trans_desc->rx_data[0], trans_buf->buffer_to_rcv, (trans_desc->rxlength + 7) / 8);
+        } else {
+            memcpy(trans_desc->rx_buffer, trans_buf->buffer_to_rcv, (trans_desc->rxlength + 7) / 8);
+        }
+        free(trans_buf->buffer_to_rcv);
+    }
+}
+
+static SPI_MASTER_ATTR esp_err_t setup_priv_desc(spi_transaction_t *trans_desc, spi_trans_priv_t* new_desc, bool isdma)
+{
+    *new_desc = (spi_trans_priv_t) { .trans = trans_desc, };
 
     // rx memory assign
+    uint32_t* rcv_ptr;
     if ( trans_desc->flags & SPI_TRANS_USE_RXDATA ) {
-        trans_buf.buffer_to_rcv = (uint32_t*)&trans_desc->rx_data[0];
+        rcv_ptr = (uint32_t *)&trans_desc->rx_data[0];
     } else {
         //if not use RXDATA neither rx_buffer, buffer_to_rcv assigned to NULL
-        trans_buf.buffer_to_rcv = trans_desc->rx_buffer;
+        rcv_ptr = trans_desc->rx_buffer;
     }
-    if ( trans_buf.buffer_to_rcv && handle->host->dma_chan && (!esp_ptr_dma_capable( trans_buf.buffer_to_rcv ) || ((int)trans_buf.buffer_to_rcv%4!=0)) ) {
+    if (rcv_ptr && isdma && (!esp_ptr_dma_capable(rcv_ptr) || ((int)rcv_ptr % 4 != 0))) {
         //if rxbuf in the desc not DMA-capable, malloc a new one. The rx buffer need to be length of multiples of 32 bits to avoid heap corruption.
-        ESP_LOGV( SPI_TAG, "Allocate RX buffer for DMA" );
-        trans_buf.buffer_to_rcv = heap_caps_malloc((trans_desc->rxlength+31)/8, MALLOC_CAP_DMA);
-        if ( trans_buf.buffer_to_rcv==NULL ) {
-            ret = ESP_ERR_NO_MEM;
-            goto clean_up;
-        }
+        ESP_LOGI( SPI_TAG, "Allocate RX buffer for DMA" );
+        rcv_ptr = heap_caps_malloc((trans_desc->rxlength + 31) / 8, MALLOC_CAP_DMA);
+        if (rcv_ptr == NULL) goto clean_up;
     }
+    new_desc->buffer_to_rcv = rcv_ptr;
 
-    const uint32_t *txdata;
     // tx memory assign
+    const uint32_t *send_ptr;
     if ( trans_desc->flags & SPI_TRANS_USE_TXDATA ) {
-        txdata = (uint32_t*)&trans_desc->tx_data[0];
+        send_ptr = (uint32_t *)&trans_desc->tx_data[0];
     } else {
         //if not use TXDATA neither tx_buffer, tx data assigned to NULL
-        txdata = trans_desc->tx_buffer ;
+        send_ptr = trans_desc->tx_buffer ;
     }
-    if ( txdata && handle->host->dma_chan && !esp_ptr_dma_capable( txdata )) {
+    if (send_ptr && isdma && !esp_ptr_dma_capable( send_ptr )) {
         //if txbuf in the desc not DMA-capable, malloc a new one
-        ESP_LOGV( SPI_TAG, "Allocate TX buffer for DMA" );
-        trans_buf.buffer_to_send = heap_caps_malloc((trans_desc->length+7)/8, MALLOC_CAP_DMA);
-        if ( trans_buf.buffer_to_send==NULL ) {
-            ret = ESP_ERR_NO_MEM;
-            goto clean_up;
+        ESP_LOGI( SPI_TAG, "Allocate TX buffer for DMA" );
+        uint32_t *temp = heap_caps_malloc((trans_desc->length + 7) / 8, MALLOC_CAP_DMA);
+        if (temp == NULL) goto clean_up;
+
+        memcpy( temp, send_ptr, (trans_desc->length + 7) / 8 );
+        send_ptr = temp;
         }
-        memcpy( trans_buf.buffer_to_send, txdata, (trans_desc->length+7)/8 );
-    } else {
-        // else use the original buffer (forced-conversion) or assign to NULL
-        trans_buf.buffer_to_send = (uint32_t*)txdata;
-    }
+    new_desc->buffer_to_send = send_ptr;
+
+    return ESP_OK;
+
+clean_up:
+    uninstall_priv_desc(new_desc);
+    return ESP_ERR_NO_MEM;
+}
+
+esp_err_t SPI_MASTER_ATTR spi_device_queue_trans(spi_device_handle_t handle, spi_transaction_t *trans_desc,  TickType_t ticks_to_wait)
+{
+    esp_err_t ret = check_trans_valid(handle, trans_desc);
+    if (ret != ESP_OK) return ret;
+
+    spi_host_t *host = handle->host;
+
+    SPI_CHECK( !device_is_polling(handle), "Cannot queue new transaction while previous polling transaction is not terminated.", ESP_ERR_INVALID_STATE );
+
+    spi_trans_priv_t trans_buf;
+    ret = setup_priv_desc(trans_desc, &trans_buf, (host->dma_chan!=0));
+    if (ret != ESP_OK) return ret;
 
 #ifdef CONFIG_PM_ENABLE
-    esp_pm_lock_acquire(handle->host->pm_lock);
+    esp_pm_lock_acquire(host->pm_lock);
 #endif
-    r=xQueueSend(handle->trans_queue, (void*)&trans_buf, ticks_to_wait);
+    //Send to queue and invoke the ISR.
+
+    BaseType_t r = xQueueSend(handle->trans_queue, (void *)&trans_buf, ticks_to_wait);
     if (!r) {
         ret = ESP_ERR_TIMEOUT;
 #ifdef CONFIG_PM_ENABLE
         //Release APB frequency lock
-        esp_pm_lock_release(handle->host->pm_lock);
+        esp_pm_lock_release(host->pm_lock);
 #endif
         goto clean_up;
     }
-    esp_intr_enable(handle->host->intr);
+    spi_isr_invoke(handle);
     return ESP_OK;
 
 clean_up:
-    // free malloc-ed buffer (if needed) before return.
-    if ( (void*)trans_buf.buffer_to_rcv != trans_desc->rx_buffer && (void*)trans_buf.buffer_to_rcv != &trans_desc->rx_data[0] ) {
-        free( trans_buf.buffer_to_rcv );
-    }
-    if ( (void*)trans_buf.buffer_to_send!= trans_desc->tx_buffer && (void*)trans_buf.buffer_to_send != &trans_desc->tx_data[0] ) {
-        free( trans_buf.buffer_to_send );
-    }
-    assert( ret != ESP_OK );
+    uninstall_priv_desc(&trans_buf);
     return ret;
 }
 
 esp_err_t SPI_MASTER_ATTR spi_device_get_trans_result(spi_device_handle_t handle, spi_transaction_t **trans_desc, TickType_t ticks_to_wait)
 {
     BaseType_t r;
-    spi_trans_priv trans_buf;
-
+    spi_trans_priv_t trans_buf;
     SPI_CHECK(handle!=NULL, "invalid dev handle", ESP_ERR_INVALID_ARG);
+
+    //use the interrupt, block until return
     r=xQueueReceive(handle->ret_queue, (void*)&trans_buf, ticks_to_wait);
     if (!r) {
         // The memory occupied by rx and tx DMA buffer destroyed only when receiving from the queue (transaction finished).
         // If timeout, wait and retry.
-        // Every on-flight transaction request occupies internal memory as DMA buffer if needed.
+        // Every in-flight transaction request occupies internal memory as DMA buffer if needed.
         return ESP_ERR_TIMEOUT;
     }
-
+    //release temporary buffers
+    uninstall_priv_desc(&trans_buf);
     (*trans_desc) = trans_buf.trans;
 
-    if ( (void*)trans_buf.buffer_to_send != &(*trans_desc)->tx_data[0] && trans_buf.buffer_to_send != (*trans_desc)->tx_buffer ) {
-        free( trans_buf.buffer_to_send );
-    }
-
-    //copy data from temporary DMA-capable buffer back to IRAM buffer and free the temporary one.
-    if ( (void*)trans_buf.buffer_to_rcv != &(*trans_desc)->rx_data[0] && trans_buf.buffer_to_rcv != (*trans_desc)->rx_buffer ) {
-        if ( (*trans_desc)->flags & SPI_TRANS_USE_RXDATA ) {
-            memcpy( (uint8_t*)&(*trans_desc)->rx_data[0], trans_buf.buffer_to_rcv, ((*trans_desc)->rxlength+7)/8 );
-        } else {
-            memcpy( (*trans_desc)->rx_buffer, trans_buf.buffer_to_rcv, ((*trans_desc)->rxlength+7)/8 );
-        }
-        free( trans_buf.buffer_to_rcv );
-    }
-
     return ESP_OK;
 }
 
@@ -895,11 +1223,139 @@ esp_err_t SPI_MASTER_ATTR spi_device_transmit(spi_device_handle_t handle, spi_tr
     esp_err_t ret;
     spi_transaction_t *ret_trans;
     //ToDo: check if any spi transfers in flight
-    ret=spi_device_queue_trans(handle, trans_desc, portMAX_DELAY);
-    if (ret!=ESP_OK) return ret;
-    ret=spi_device_get_trans_result(handle, &ret_trans, portMAX_DELAY);
-    if (ret!=ESP_OK) return ret;
-    assert(ret_trans==trans_desc);
+    ret = spi_device_queue_trans(handle, trans_desc, portMAX_DELAY);
+    if (ret != ESP_OK) return ret;
+
+    ret = spi_device_get_trans_result(handle, &ret_trans, portMAX_DELAY);
+    if (ret != ESP_OK) return ret;
+
+    assert(ret_trans == trans_desc);
     return ESP_OK;
 }
 
+esp_err_t SPI_MASTER_ATTR spi_device_acquire_bus(spi_device_t *device, TickType_t wait)
+{
+    spi_host_t *const host = device->host;
+    SPI_CHECK(wait==portMAX_DELAY, "acquire finite time not supported now.", ESP_ERR_INVALID_ARG);
+    SPI_CHECK( !device_is_polling(device), "Cannot acquire bus when a polling transaction is in progress.", ESP_ERR_INVALID_STATE );
+    //Only an infinite wait is accepted (checked above). An error from either helper below is returned to the caller unchanged.
+    esp_err_t ret = device_acquire_bus_internal(device, portMAX_DELAY);
+    if (ret != ESP_OK) return ret;
+    ret = device_wait_for_isr_idle(device, portMAX_DELAY);
+    if (ret != ESP_OK) return ret;
+    //spi_device_polling_end() checks bus_locked and keeps the bus while it is set.
+    device->host->bus_locked = true;
+
+    ESP_LOGD(SPI_TAG, "device%d acquired the bus", device->id);
+
+#ifdef CONFIG_PM_ENABLE
+    // Blocking the task before ``release_bus`` is not recommended, but it is still allowed;
+    // this keeps the spi clock at 80MHz even if all tasks are blocked
+    esp_pm_lock_acquire(device->host->pm_lock);
+#endif
+    //configure the device so that we don't need to do it again in the following transactions
+    spi_setup_device( host, device->id );
+    //the DMA is also occupied by the device; all the slave devices using DMA should wait until the bus is released.
+    if (host->dma_chan != 0) {
+        spicommon_dmaworkaround_transfer_active(host->dma_chan);
+    }
+    return ESP_OK;
+}
+
+// Release the bus held by the device. This function restores the configurations required in the non-polling mode.
+void SPI_MASTER_ATTR spi_device_release_bus(spi_device_t *dev)
+{
+    spi_host_t *host = dev->host;
+    //releasing the bus in the middle of a polling transaction is a usage error: it is logged and then asserted on
+    if (device_is_polling(dev)){
+        ESP_LOGE(SPI_TAG, "Cannot release bus when a polling transaction is in progress.");
+        assert(0);
+    }
+
+    //Tell common code DMA workaround that our DMA channel is idle. If needed, the code will do a DMA reset.
+    if (host->dma_chan != 0) {
+        spicommon_dmaworkaround_idle(host->dma_chan);
+    }
+
+    //allow clock to be lower than 80MHz when all tasks blocked
+#ifdef CONFIG_PM_ENABLE
+    //Release APB frequency lock
+    esp_pm_lock_release(host->pm_lock);
+#endif
+    ESP_LOGD(SPI_TAG, "device%d release bus", dev->id);
+
+    dev->host->bus_locked = false;
+    device_release_bus_internal(dev->host);
+}
+
+esp_err_t SPI_MASTER_ISR_ATTR spi_device_polling_start(spi_device_handle_t handle, spi_transaction_t *trans_desc, TickType_t ticks_to_wait)
+{
+    //Start a polling transaction on the device; it has to be finished by spi_device_polling_end().
+    SPI_CHECK(ticks_to_wait == portMAX_DELAY, "currently timeout is not available for polling transactions", ESP_ERR_INVALID_ARG);
+    //check_trans_valid() rejects a NULL handle, so the handle is only dereferenced after it has passed.
+    esp_err_t ret = check_trans_valid(handle, trans_desc);
+    if (ret!=ESP_OK) return ret;
+    spi_host_t *host = handle->host;
+    SPI_CHECK( !device_is_polling(handle), "Cannot send polling transaction while the previous polling transaction is not terminated.", ESP_ERR_INVALID_STATE );
+    //host->cur_trans_buf is shared by the whole host: fill it in only after both helpers below have returned successfully.
+    ret = device_acquire_bus_internal(handle, portMAX_DELAY);
+    if (ret != ESP_OK) return ret;
+    ret = device_wait_for_isr_idle(handle, portMAX_DELAY);
+    if (ret == ESP_OK) ret = setup_priv_desc(trans_desc, &host->cur_trans_buf, (host->dma_chan!=0));
+    if (ret != ESP_OK) {
+        //same rule as in spi_device_polling_end(): a bus held through spi_device_acquire_bus() is not released
+        if (!host->bus_locked) device_release_bus_internal(host);
+        return ret;
+    }
+    assert(atomic_load(&host->acquire_cs) == handle->id);
+    assert(host->isr_free);
+
+    //Polling, no interrupt is used.
+    host->polling = true;
+    ESP_LOGV(SPI_TAG, "polling trans");
+    spi_new_trans(handle, &host->cur_trans_buf);
+    return ESP_OK;
+}
+
+esp_err_t SPI_MASTER_ISR_ATTR spi_device_polling_end(spi_device_handle_t handle, TickType_t ticks_to_wait)
+{
+    SPI_CHECK(handle != NULL, "invalid dev handle", ESP_ERR_INVALID_ARG);
+    spi_host_t *host = handle->host;
+    //Finish a polling transaction: wait for the trans_done flag, post-process the transaction and free temporary buffers.
+    //NOTE(review): nothing here verifies that this device really has a polling transaction in progress - confirm callers guarantee it.
+    assert(host->cur_cs == atomic_load(&host->acquire_cs));
+    TickType_t start = xTaskGetTickCount();
+    //Busy-wait on trans_done. On timeout the function returns before the clean-up below, so host->polling stays set.
+    while (!host->hw->slave.trans_done) {
+        TickType_t end = xTaskGetTickCount();
+        if (end - start > ticks_to_wait) {
+            return ESP_ERR_TIMEOUT;
+        }
+    }
+    ESP_LOGV(SPI_TAG, "polling trans done");
+    //deal with the in-flight transaction
+    spi_post_trans(host);
+    //release temporary buffers
+    uninstall_priv_desc(&host->cur_trans_buf);
+    host->polling = false;
+    //a bus held through spi_device_acquire_bus() (bus_locked set) is kept, otherwise it is released here
+    if (!host->bus_locked) {
+        device_release_bus_internal(host);
+    }
+
+    return ESP_OK;
+}
+
+esp_err_t SPI_MASTER_ISR_ATTR spi_device_polling_transmit(spi_device_handle_t handle, spi_transaction_t* trans_desc)
+{
+    //Run one complete polling transaction: start it, then wait (without timeout) until it is finished.
+    const esp_err_t start_err = spi_device_polling_start(handle, trans_desc, portMAX_DELAY);
+    if (start_err != ESP_OK) {
+        //starting failed, so the error is reported without calling spi_device_polling_end()
+        return start_err;
+    }
+    //ESP_OK or the error of the end call is passed on unchanged
+    return spi_device_polling_end(handle, portMAX_DELAY);
+}
+
+
diff --git a/components/driver/test/test_spi_master.c b/components/driver/test/test_spi_master.c
index 9b82477d0..1ead517c6 100644
--- a/components/driver/test/test_spi_master.c
+++ b/components/driver/test/test_spi_master.c
@@ -23,6 +23,7 @@
 #include "freertos/ringbuf.h"
 #include "soc/gpio_periph.h"
 #include "sdkconfig.h"
+#include "unity_config.h"
 
 const static char TAG[] = "test_spi";
 
@@ -1335,6 +1336,7 @@ TEST_CASE("spi_speed","[spi]")
     uint32_t t_flight;
     //to get rid of the influence of randomly interrupts, we measured the performance by median value
     uint32_t t_flight_sorted[TEST_TIMES];
+    esp_err_t ret;
     int t_flight_num = 0;
 
     spi_device_handle_t spi;
@@ -1347,9 +1349,6 @@ TEST_CASE("spi_speed","[spi]")
     //first work with DMA
     speed_setup(&spi, use_dma);
 
-    //first time introduces a device switch, which costs more time. we skip this
-    spi_device_transmit(spi, &trans);
-
     //record flight time by isr, with DMA
     t_flight_num = 0;
     for (int i = 0; i < TEST_TIMES; i++) {
@@ -1364,15 +1363,34 @@ TEST_CASE("spi_speed","[spi]")
         ESP_LOGI(TAG, "%d", t_flight_sorted[i]);
     }
 
+    //acquire the bus to send polling transactions faster
+    ret = spi_device_acquire_bus(spi, portMAX_DELAY);
+    TEST_ESP_OK(ret);
+
+    //record flight time by polling and with DMA
+    t_flight_num = 0;
+    for (int i = 0; i < TEST_TIMES; i++) {
+        spi_device_polling_transmit(spi, &trans); // prime the flash cache
+        RECORD_TIME_START();
+        spi_device_polling_transmit(spi, &trans);
+        RECORD_TIME_END(&t_flight);
+        sorted_array_insert(t_flight_sorted, &t_flight_num, t_flight);
+    }
+    TEST_PERFORMANCE_LESS_THAN(SPI_PER_TRANS_POLLING, "%d us", t_flight_sorted[(TEST_TIMES+1)/2]);
+    for (int i = 0; i < TEST_TIMES; i++) {
+        ESP_LOGI(TAG, "%d", t_flight_sorted[i]);
+    }
+
+    //release the bus
+    spi_device_release_bus(spi);
+
     speed_deinit(spi);
     speed_setup(&spi, !use_dma);
 
-    //first time introduces a device switch, which costs more time. we skip this
-    spi_device_transmit(spi, &trans);
-
     //record flight time by isr, without DMA
     t_flight_num = 0;
     for (int i = 0; i < TEST_TIMES; i++) {
+        spi_device_transmit(spi, &trans); // prime the flash cache
         RECORD_TIME_START();
         spi_device_transmit(spi, &trans);
         RECORD_TIME_END(&t_flight);
@@ -1382,6 +1400,165 @@ TEST_CASE("spi_speed","[spi]")
     for (int i = 0; i < TEST_TIMES; i++) {
         ESP_LOGI(TAG, "%d", t_flight_sorted[i]);
     }
+
+    //acquire the bus to send polling transactions faster
+    ret = spi_device_acquire_bus(spi, portMAX_DELAY);
+    TEST_ESP_OK(ret);
+    //record flight time by polling, without DMA
+    t_flight_num = 0;
+    for (int i = 0; i < TEST_TIMES; i++) {
+        spi_device_polling_transmit(spi, &trans); // prime the flash cache
+        RECORD_TIME_START();
+        spi_device_polling_transmit(spi, &trans);
+        RECORD_TIME_END(&t_flight);
+        sorted_array_insert(t_flight_sorted, &t_flight_num, t_flight);
+    }
+    TEST_PERFORMANCE_LESS_THAN(SPI_PER_TRANS_POLLING_NO_DMA, "%d us", t_flight_sorted[(TEST_TIMES+1)/2]);
+    for (int i = 0; i < TEST_TIMES; i++) {
+        ESP_LOGI(TAG, "%d", t_flight_sorted[i]);
+    }
+
+    //release the bus
+    spi_device_release_bus(spi);
     speed_deinit(spi);
 }
 
+typedef struct {
+    spi_device_handle_t handle;     //device the task sends its transactions to
+    bool finished;                  //set by the task right before it deletes itself; polled by the test case
+} task_context_t;
+
+void spi_task1(void* arg)
+{
+    //task1 sends 50 polling transactions without holding the bus, then acquires the bus, sends another 50 and releases it
+    int count=0;
+    spi_transaction_t t = {
+        .flags = SPI_TRANS_USE_TXDATA,
+        .tx_data = { 0x80, 0x12, 0x34, 0x56 },
+        .length = 4*8,
+    };
+    spi_device_handle_t handle = ((task_context_t*)arg)->handle;     //arg is the task_context_t of this task
+    for( int j = 0; j < 50; j ++ ) {
+        TEST_ESP_OK(spi_device_polling_transmit( handle, &t ));
+        ESP_LOGI( TAG, "task1:%d", count++ );
+    }
+    TEST_ESP_OK(spi_device_acquire_bus( handle, portMAX_DELAY ));
+    for( int j = 0; j < 50; j ++ ) {
+        TEST_ESP_OK(spi_device_polling_transmit( handle, &t ));
+        ESP_LOGI( TAG, "task1:%d", count++ );
+    }
+    spi_device_release_bus(handle);
+    ESP_LOGI(TAG, "task1 terminates");
+    ((task_context_t*)arg)->finished = true;     //tell the test case that this task is done
+    vTaskDelete(NULL);
+}
+
+void spi_task2(void* arg)
+{
+    int count=0;
+    //task2 acquires the bus, sends 50 polling transactions and then 50 non-polling (queued) ones before releasing the bus
+    spi_transaction_t t = {
+        .flags = SPI_TRANS_USE_TXDATA,
+        .tx_data = { 0x80, 0x12, 0x34, 0x56 },
+        .length = 4*8,
+    };
+    spi_transaction_t *ret_t;
+    spi_device_handle_t handle = ((task_context_t*)arg)->handle;     //arg is the task_context_t of this task
+    TEST_ESP_OK(spi_device_acquire_bus( handle, portMAX_DELAY ));
+
+    for (int i = 0; i < 50; i ++) {
+        TEST_ESP_OK(spi_device_polling_transmit(handle, &t));
+        ESP_LOGI( TAG, "task2: %d", count++ );
+    }
+    //all 50 transactions are queued before any result is fetched, then the 50 results are collected
+    for( int j = 0; j < 50; j ++ ) {
+        TEST_ESP_OK(spi_device_queue_trans( handle, &t, portMAX_DELAY ));
+    }
+    for( int j = 0; j < 50; j ++ ) {
+        TEST_ESP_OK(spi_device_get_trans_result(handle, &ret_t, portMAX_DELAY));
+        assert(ret_t == &t);     //the same descriptor is queued every time, so every result must point to it
+        ESP_LOGI( TAG, "task2: %d", count++ );
+    }
+    spi_device_release_bus(handle);
+    vTaskDelay(1);
+    ESP_LOGI(TAG, "task2 terminates");
+    ((task_context_t*)arg)->finished = true;     //tell the test case that this task is done
+    vTaskDelete(NULL);
+}
+
+void spi_task3(void* arg)
+{
+    //task3 sends 30 polling transactions, acquires the bus, sends 20 polling transactions and then 50 non-polling (queued) ones
+    int count=0;
+    spi_transaction_t t = {
+        .flags = SPI_TRANS_USE_TXDATA,
+        .tx_data = { 0x80, 0x12, 0x34, 0x56 },
+        .length = 4*8,
+    };
+    spi_transaction_t *ret_t;
+    spi_device_handle_t handle = ((task_context_t*)arg)->handle;     //arg is the task_context_t of this task
+
+    for (int i = 0; i < 30; i ++) {
+        TEST_ESP_OK(spi_device_polling_transmit(handle, &t));
+        ESP_LOGI( TAG, "task3: %d", count++ );
+    }
+
+    TEST_ESP_OK(spi_device_acquire_bus( handle, portMAX_DELAY ));
+    for (int i = 0; i < 20; i ++) {
+        TEST_ESP_OK(spi_device_polling_transmit(handle, &t));
+        ESP_LOGI( TAG, "task3: %d", count++ );
+    }
+    //all 50 transactions are queued before any result is fetched, then the 50 results are collected
+    for (int j = 0; j < 50; j++) {
+        TEST_ESP_OK(spi_device_queue_trans(handle, &t, portMAX_DELAY));
+    }
+    for (int j = 0; j < 50; j++) {
+        TEST_ESP_OK(spi_device_get_trans_result(handle, &ret_t, portMAX_DELAY));
+        assert(ret_t == &t);     //the same descriptor is queued every time, so every result must point to it
+        ESP_LOGI(TAG, "task3: %d", count++);
+    }
+    spi_device_release_bus(handle);
+
+    ESP_LOGI(TAG, "task3 terminates");
+    ((task_context_t*)arg)->finished = true;     //tell the test case that this task is done
+    vTaskDelete(NULL);
+}
+
+TEST_CASE("spi poll tasks","[spi]")
+{
+    task_context_t context1={};
+    task_context_t context2={};
+    task_context_t context3={};
+    TaskHandle_t task1, task2, task3;
+    esp_err_t ret;
+    spi_bus_config_t buscfg=SPI_BUS_TEST_DEFAULT_CONFIG();
+    spi_device_interface_config_t devcfg=SPI_DEVICE_TEST_DEFAULT_CONFIG();
+    devcfg.queue_size = 100;     //task2 and task3 each queue 50 transactions before fetching any result
+
+    //Initialize the SPI bus and 3 devices; all three devices share the same device configuration
+    ret=spi_bus_initialize(HSPI_HOST, &buscfg, 1);
+    TEST_ASSERT(ret==ESP_OK);
+    ret=spi_bus_add_device(HSPI_HOST, &devcfg, &context1.handle);
+    TEST_ASSERT(ret==ESP_OK);
+    ret=spi_bus_add_device(HSPI_HOST, &devcfg, &context2.handle);
+    TEST_ASSERT(ret==ESP_OK);
+    ret=spi_bus_add_device(HSPI_HOST, &devcfg, &context3.handle);
+    TEST_ASSERT(ret==ESP_OK);
+    //one task per device; each task gets its own context and is created with priority 0
+    xTaskCreate( spi_task1, "task1", 2048, &context1, 0, &task1 );
+    xTaskCreate( spi_task2, "task2", 2048, &context2, 0, &task2 );
+    xTaskCreate( spi_task3, "task3", 2048, &context3, 0, &task3 );
+    //poll every 10 ticks until all three tasks have set their finished flag
+    for(;;){
+        vTaskDelay(10);
+        if (context1.finished && context2.finished && context3.finished) break;
+    }
+
+    TEST_ESP_OK( spi_bus_remove_device(context1.handle) );
+    TEST_ESP_OK( spi_bus_remove_device(context2.handle) );
+    TEST_ESP_OK( spi_bus_remove_device(context3.handle) );
+    TEST_ESP_OK( spi_bus_free(HSPI_HOST) );
+}
+
+
+//TODO: add a case when a non-polling transaction happened in the bus-acquiring time and then release the bus then queue a new trans
\ No newline at end of file
diff --git a/components/idf_test/include/idf_performance.h b/components/idf_test/include/idf_performance.h
index fc128eecb..8a36935c8 100644
--- a/components/idf_test/include/idf_performance.h
+++ b/components/idf_test/include/idf_performance.h
@@ -16,6 +16,8 @@
 #define IDF_PERFORMANCE_MAX_ESP_TIMER_GET_TIME_PER_CALL                         1000
 #define IDF_PERFORMANCE_MAX_SPI_PER_TRANS_NO_POLLING                            30
 #define IDF_PERFORMANCE_MAX_SPI_PER_TRANS_NO_POLLING_NO_DMA                     27
+#define IDF_PERFORMANCE_MAX_SPI_PER_TRANS_POLLING                               15
+#define IDF_PERFORMANCE_MAX_SPI_PER_TRANS_POLLING_NO_DMA                        15
 /* Due to code size & linker layout differences interacting with cache, VFS
    microbenchmark currently runs slower with PSRAM enabled. */
 #define IDF_PERFORMANCE_MAX_VFS_OPEN_WRITE_CLOSE_TIME                           20000
diff --git a/docs/en/api-reference/peripherals/spi_master.rst b/docs/en/api-reference/peripherals/spi_master.rst
index e836c1be4..9ec4952b4 100644
--- a/docs/en/api-reference/peripherals/spi_master.rst
+++ b/docs/en/api-reference/peripherals/spi_master.rst
@@ -15,7 +15,16 @@ The spi_master driver
 
 The spi_master driver allows easy communicating with SPI slave devices, even in a multithreaded environment.
 It fully transparently handles DMA transfers to read and write data and automatically takes care of
-multiplexing between different SPI slaves on the same master
+multiplexing between different SPI slaves on the same master.
+
+.. note::
+
+    **Notes about thread safety**
+
+    The SPI driver API is thread safe when multiple SPI devices on the same bus are accessed from different tasks. However, the driver is not thread safe if the same SPI device is accessed from multiple tasks.
+
+    In this case, it is recommended to either refactor your application so only a single task accesses each SPI device, or to add mutex locking around access of the shared device.
+
 
 Terminology
 ^^^^^^^^^^^
@@ -45,7 +54,6 @@ The spi_master driver uses the following terms:
   CS going inactive again. Transactions are atomic, as in they will never be interrupted by another
   transaction.
 
-
 SPI transactions
 ^^^^^^^^^^^^^^^^
 
@@ -73,6 +81,154 @@ Something similar is true for the read and write phase: not every transaction ne
 as well as data to be read. When ``rx_buffer`` is NULL (and SPI_USE_RXDATA) is not set) the read phase
 is skipped. When ``tx_buffer`` is NULL (and SPI_USE_TXDATA) is not set) the write phase is skipped.
 
+The driver offers two different kinds of transactions: the interrupt
+transactions and the polling transactions. Each device can choose one kind of
+transaction to send. See :ref:`mixed_transactions` if your device does require
+both kinds of transactions.
+
+.. _interrupt_transactions:
+
+Interrupt transactions
+""""""""""""""""""""""""
+
+The interrupt transactions use an interrupt-driven logic when the
+transactions are in-flight. The routine will get blocked, allowing the CPU to
+run other tasks, while it is waiting for a transaction to be finished.
+
+Interrupt transactions can be queued into a device; the driver automatically
+sends them one-by-one in the ISR. A task can queue several transactions, and
+then do something else before the transactions are finished.
+
+.. _polling_transactions:
+
+Polling transactions
+""""""""""""""""""""
+
+The polling transactions don't rely on the interrupt, the routine keeps polling
+the status bit of the SPI peripheral until the transaction is done.
+
+All the tasks that do interrupt transactions may get blocked by the queue, at
+which point they need to wait for the ISR to run twice before the transaction
+is done. Polling transactions save the time spent on queue handling and
+context switching, resulting in a smaller transaction interval. The
+disadvantage is that the CPU is busy while these transactions are in
+flight.
+
+The ``spi_device_polling_end`` routine spends at least 1us overhead to
+unblock other tasks when the transaction is done. It is strongly recommended
+to wrap a series of polling transactions inside of ``spi_device_acquire_bus``
+and ``spi_device_release_bus`` to avoid the overhead. (See
+:ref:`bus_acquiring`)
+
+Command and address phases
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+During the command and address phases, the ``cmd`` and ``addr`` fields in the
+``spi_transaction_t`` struct are sent to the bus, while nothing is read at the
+same time. The default lengths of the command and address phases are set in the
+``spi_device_interface_config_t`` passed to ``spi_bus_add_device``. When the
+flags ``SPI_TRANS_VARIABLE_CMD`` and ``SPI_TRANS_VARIABLE_ADDR`` are not set in
+the ``spi_transaction_t``, the driver automatically sets the lengths of these
+phases to the default values that were set when the device was initialized.
+
+If the length of the command and address phases needs to be variable, declare a
+``spi_transaction_ext_t`` descriptor, set the flag ``SPI_TRANS_VARIABLE_CMD``
+and/or ``SPI_TRANS_VARIABLE_ADDR`` in the ``flags`` of the ``base`` member and
+configure the rest of ``base`` as usual. Then the length of each phase
+will be ``command_bits`` and ``address_bits`` set in the ``spi_transaction_ext_t``.
+
+Write and read phases
+^^^^^^^^^^^^^^^^^^^^^
+
+Normally, data to be transferred to or from a device will be read from or written to a chunk of memory
+indicated by the ``rx_buffer`` and ``tx_buffer`` members of the transaction structure.
+When DMA is enabled for transfers, these buffers are highly recommended to meet the requirements as below:
+
+  1. allocated in DMA-capable memory using ``heap_caps_malloc(size, MALLOC_CAP_DMA)``;
+  2. 32-bit aligned (start from the boundary and have length of multiples of 4 bytes).
+
+If these requirements are not satisfied, efficiency of the transaction will suffer due to the allocation and
+memcpy of temporary buffers.
+
+.. note::  Half duplex transactions with both read and write phases are not supported when using DMA. See
+  :ref:`spi_known_issues` for details and workarounds.
+
+.. _bus_acquiring:
+
+Bus acquiring
+^^^^^^^^^^^^^
+
+Sometimes you may want to send spi transactions exclusively, continuously, to
+make it as fast as possible. You may use ``spi_device_acquire_bus`` and
+``spi_device_release_bus`` to realize this. When the bus is acquired,
+transactions to other devices (no matter polling or interrupt) are pending
+until the bus is released.
+
+Using the spi_master driver
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Initialize a SPI bus by calling ``spi_bus_initialize``. Make sure to set the correct IO pins in
+  the ``bus_config`` struct. Take care to set signals that are not needed to -1.
+
+- Tell the driver about a SPI slave device connected to the bus by calling spi_bus_add_device.
+  Make sure to configure any timing requirements the device has in the ``dev_config`` structure.
+  You should now have a handle for the device, to be used when sending it a transaction.
+
+- To interact with the device, fill one or more spi_transaction_t structure with any transaction
+  parameters you need. Then send them either in a polling way or the interrupt way:
+
+    - :ref:`Interrupt <interrupt_transactions>`
+        Either queue all transactions by calling ``spi_device_queue_trans``,
+        and at a later time query the result using
+        ``spi_device_get_trans_result``, or handle all requests
+        synchronously by feeding them into ``spi_device_transmit``.
+
+    - :ref:`Polling <polling_transactions>`
+        Call the ``spi_device_polling_transmit`` to send polling
+        transactions. Alternatively, you can send a polling transaction by
+        ``spi_device_polling_start`` and ``spi_device_polling_end`` if you
+        want to insert something between them.
+
+- Optional: to do back-to-back transactions to a device, call
+  ``spi_device_acquire_bus`` before and ``spi_device_release_bus`` after the
+  transactions.
+
+- Optional: to unload the driver for a device, call ``spi_bus_remove_device`` with the device
+  handle as an argument
+
+- Optional: to remove the driver for a bus, make sure no more drivers are attached and call
+  ``spi_bus_free``.
+
+Tips
+""""
+
+1. Transactions with small amount of data:
+    Sometimes, the amount of data is very small making it less than optimal allocating a separate buffer
+    for it. If the data to be transferred is 32 bits or less, it can be stored in the transaction struct
+    itself. For transmitted data, use the ``tx_data`` member for this and set the ``SPI_TRANS_USE_TXDATA`` flag
+    on the transmission. For received data, use ``rx_data`` and set ``SPI_TRANS_USE_RXDATA``. In both cases, do
+    not touch the ``tx_buffer`` or ``rx_buffer`` members, because they use the same memory locations
+    as ``tx_data`` and ``rx_data``.
+
+2. Transactions with integers other than uint8_t
+    The SPI peripheral reads and writes the memory byte-by-byte. By default,
+    the SPI works in MSB-first mode: each byte is sent or received from the
+    MSB to the LSB. However, if you want to send data whose length is
+    not a multiple of 8 bits, unused bits are sent.
+
+    E.g. you write ``uint8_t data = 0x15`` (00010101B), and set length to
+    only 5 bits, the sent data is ``00010B`` rather than expected ``10101B``.
+
+    Moreover, ESP32 is a little-endian chip whose lowest byte is stored at
+    the very beginning address for uint16_t and uint32_t variables. Hence if
+    a uint16_t is stored in the memory, its bit 7 is sent first, then bits 6
+    to 0, then come its bits 15 to 8.
+
+    To send data other than uint8_t arrays, the macro ``SPI_SWAP_DATA_TX`` is
+    provided to shift your data to the MSB and swap the MSB to the lowest
+    address, while ``SPI_SWAP_DATA_RX`` can be used to swap received data
+    from the MSB to its correct place.
+
 GPIO matrix and IOMUX
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -113,93 +269,39 @@ IOMUX pins for SPI controllers are as below:
 
 note * Only the first device attaching to the bus can use CS0 pin.
 
-Using the spi_master driver
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. _mixed_transactions:
 
-- Initialize a SPI bus by calling ``spi_bus_initialize``. Make sure to set the correct IO pins in
-  the ``bus_config`` struct. Take care to set signals that are not needed to -1.
+Notes to send mixed transactions to the same device
+"""""""""""""""""""""""""""""""""""""""""""""""""""
 
-- Tell the driver about a SPI slave device connected to the bus by calling spi_bus_add_device.
-  Make sure to configure any timing requirements the device has in the ``dev_config`` structure.
-  You should now have a handle for the device, to be used when sending it a transaction.
+Though we suggest sending only one type (interrupt or polling) of
+transactions to one device to reduce coding complexity, sending both
+interrupt and polling transactions alternately is supported. The notes
+below are to help you do this.
 
-- To interact with the device, fill one or more spi_transaction_t structure with any transaction
-  parameters you need. Either queue all transactions by calling ``spi_device_queue_trans``, later
-  quering the result using ``spi_device_get_trans_result``, or handle all requests synchroneously
-  by feeding them into ``spi_device_transmit``.
+Polling transactions should be started only when all the other transactions
+are finished, no matter whether they are polling or interrupt transactions.
 
-- Optional: to unload the driver for a device, call ``spi_bus_remove_device`` with the device
-  handle as an argument
+An unfinished polling transaction forbids other transactions from being sent.
+Always call ``spi_device_polling_end`` after ``spi_device_polling_start`` to
+allow other devices to use the bus, or to allow other transactions to be
+started to the same device. You can use ``spi_device_polling_transmit`` to
+simplify this if you don't need to do anything during your polling transaction.
 
-- Optional: to remove the driver for a bus, make sure no more drivers are attached and call
-  ``spi_bus_free``.
+An in-flight polling transaction would get disturbed by the ISR operation
+caused by interrupt transactions. Always make sure all the interrupt
+transactions sent to the ISR are finished before you call
+``spi_device_polling_start``. To do that, you can call
+``spi_device_get_trans_result`` until all the transactions are returned.
 
-Command and address phases
-^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-During the command and address phases, ``cmd`` and ``addr`` field in the
-``spi_transaction_t`` struct are sent to the bus, while nothing is read at the
-same time. The default length of command and address phase are set in the
-``spi_device_interface_config_t`` and by ``spi_bus_add_device``. When the the
-flag ``SPI_TRANS_VARIABLE_CMD`` and ``SPI_TRANS_VARIABLE_ADDR`` are not set in
-the ``spi_transaction_t``,the driver automatically set the length of these
-phases to the default value as set when the device is initialized respectively.
-
-If the length of command and address phases needs to be variable, declare a
-``spi_transaction_ext_t`` descriptor, set the flag ``SPI_TRANS_VARIABLE_CMD``
-or/and ``SPI_TRANS_VARIABLE_ADDR`` in the ``flags`` of ``base`` member and
-configure the rest part of ``base`` as usual. Then the length of each phases
-will be ``command_bits`` and ``address_bits`` set in the ``spi_transaction_ext_t``.
-
-Write and read phases
-^^^^^^^^^^^^^^^^^^^^^
-
-Normally, data to be transferred to or from a device will be read from or written to a chunk of memory
-indicated by the ``rx_buffer`` and ``tx_buffer`` members of the transaction structure.
-When DMA is enabled for transfers, these buffers are highly recommended to meet the requirements as below:
-
-  1. allocated in DMA-capable memory using ``pvPortMallocCaps(size, MALLOC_CAP_DMA)``;
-  2. 32-bit aligned (start from the boundary and have length of multiples of 4 bytes).
-
-If these requirements are not satisfied, efficiency of the transaction will suffer due to the allocation and
-memcpy of temporary buffers.
-
-.. note::  Half duplex transactions with both read and write phases are not supported when using DMA. See
-  :ref:`spi_known_issues` for details and workarounds.
-
-Tips
-""""
-
-1. Transactions with small amount of data:
-    Sometimes, the amount of data is very small making it less than optimal allocating a separate buffer
-    for it. If the data to be transferred is 32 bits or less, it can be stored in the transaction struct
-    itself. For transmitted data, use the ``tx_data`` member for this and set the ``SPI_USE_TXDATA`` flag
-    on the transmission. For received data, use ``rx_data`` and set ``SPI_USE_RXDATA``. In both cases, do
-    not touch the ``tx_buffer`` or ``rx_buffer`` members, because they use the same memory locations
-    as ``tx_data`` and ``rx_data``.
-
-2. Transactions with integers other than uint8_t
-    The SPI peripheral reads and writes the memory byte-by-byte. By default,
-    the SPI works at MSB first mode, each bytes are sent or received from the
-    MSB to the LSB. However, if you want to send data with length which is
-    not multiples of 8 bits, unused bits are sent.
-
-    E.g. you write ``uint8_t data = 0x15`` (00010101B), and set length to
-    only 5 bits, the sent data is ``00010B`` rather than expected ``10101B``.
-
-    Moreover, ESP32 is a little-endian chip whose lowest byte is stored at
-    the very beginning address for uint16_t and uint32_t variables. Hence if
-    a uint16_t is stored in the memory, it's bit 7 is first sent, then bit 6
-    to 0, then comes its bit 15 to bit 8.
-
-    To send data other than uint8_t arrays, macros ``SPI_SWAP_DATA_TX`` is
-    provided to shift your data to the MSB and swap the MSB to the lowest
-    address; while ``SPI_SWAP_DATA_RX`` can be used to swap received data
-    from the MSB to it's correct place.
+It is strongly recommended to send mixed transactions to the same device in
+only one task to control the calling sequence of functions.
 
 Speed and Timing Considerations
 -------------------------------
 
+.. _speed_considerations:
+
 Transferring speed
 ^^^^^^^^^^^^^^^^^^
 
@@ -207,13 +309,20 @@ There're two factors limiting the transferring speed: (1) The transaction interv
 When large transactions are used, the clock frequency determines the transferring speed; while the interval effects the
 speed a lot if small transactions are used.
 
-    1. Transaction interval: The interval mainly comes from the cost of FreeRTOS queues and the time switching between
-       tasks and the ISR. It also takes time for the software to setup spi peripheral registers as well as copy data to
-       FIFOs, or setup DMA links. Depending on whether the DMA is used, the interval of an one-byte transaction is around
-       25us typically.
+    1. Transaction interval: It takes time for the software to set up the SPI
+       peripheral registers as well as to copy data to FIFOs, or set up DMA
+       links. When interrupt transactions are used, extra overhead is added
+       by the cost of FreeRTOS queues and the time spent switching between
+       tasks and the ISR.
+
+            1. For **interrupt transactions**, the CPU can switch to other
+               tasks while the transaction is in flight. This saves CPU time
+               but increases the interval (see :ref:`interrupt_transactions`).
+               For **polling transactions**, the driver does not block the
+               task but polls while the transaction is in flight, which keeps
+               the CPU busy but shortens the interval (see
+               :ref:`polling_transactions`).
 
-            1.  The CPU is blocked and switched to other tasks when the
-                transaction is in flight. This save the cpu time but increase the interval.
             2.  When the DMA is enabled, it needs about 2us per transaction to setup the linked list. When the master is
                 transferring, it automatically read data from the linked list. If the DMA is not enabled,
                 CPU has to write/read each byte to/from the FIFO by itself. Usually this is faster than 2us, but the
@@ -221,20 +330,20 @@ speed a lot if small transactions are used.
 
        Typical transaction interval with one byte data is as below:
 
-       +--------+------------------+
-       |        | Transaction Time |
-       +========+==================+
-       |        | Typical (us)     |
-       +--------+------------------+
-       | DMA    | 24               |
-       +--------+------------------+
-       | No DMA | 22               |
-       +--------+------------------+
+       +--------+----------------+--------------+
+       |        | Typical Transaction Time (us) |
+       +========+================+==============+
+       |        | Interrupt      | Polling      |
+       +--------+----------------+--------------+
+       | DMA    | 24             | 8            |
+       +--------+----------------+--------------+
+       | No DMA | 22             | 7            |
+       +--------+----------------+--------------+
 
     2. SPI clock frequency: Each byte transferred takes 8 times of the clock period *8/fspi*. If the clock frequency is
        too high, some functions may be limited to use. See :ref:`timing_considerations`.
 
-For a normal transaction, the overall cost is *20+8n/Fspi[MHz]* [us] for n bytes tranferred
+For an interrupt transaction, the overall cost is *20+8n/Fspi[MHz]* [us] for n bytes transferred
 in one transaction. Hence the transferring speed is : *n/(20+8n/Fspi)*. Example of transferring speed under 8MHz
 clock speed:
 
@@ -381,13 +490,6 @@ table:
 +--------+------------------+----------------------+-------------------+
 
 
-Thread Safety
--------------
-
-The SPI driver API is thread safe when multiple SPI devices on the same bus are accessed from different tasks. However, the driver is not thread safe if the same SPI device is accessed from multiple tasks.
-
-In this case, it is recommended to either refactor your application so only a single task accesses each SPI device, or to add mutex locking around access of the shared device.
-
 .. _spi_known_issues:
 
 Known Issues
diff --git a/examples/peripherals/spi_master/main/spi_master_example_main.c b/examples/peripherals/spi_master/main/spi_master_example_main.c
index 8b55fe268..346488bd4 100644
--- a/examples/peripherals/spi_master/main/spi_master_example_main.c
+++ b/examples/peripherals/spi_master/main/spi_master_example_main.c
@@ -152,7 +152,13 @@ DRAM_ATTR static const lcd_init_cmd_t ili_init_cmds[]={
     {0, {0}, 0xff},
 };
 
-//Send a command to the LCD. Uses spi_device_transmit, which waits until the transfer is complete.
+/* Send a command to the LCD. Uses spi_device_polling_transmit, which waits
+ * until the transfer is complete.
+ *
+ * Since command transactions are usually small, they are handled in polling
+ * mode for higher speed. The overhead of interrupt transactions is more than
+ * just waiting for the transaction to complete.
+ */
 void lcd_cmd(spi_device_handle_t spi, const uint8_t cmd)
 {
     esp_err_t ret;
@@ -161,11 +167,17 @@ void lcd_cmd(spi_device_handle_t spi, const uint8_t cmd)
     t.length=8;                     //Command is 8 bits
     t.tx_buffer=&cmd;               //The data is the cmd itself
     t.user=(void*)0;                //D/C needs to be set to 0
-    ret=spi_device_transmit(spi, &t);  //Transmit!
+    ret=spi_device_polling_transmit(spi, &t);  //Transmit!
     assert(ret==ESP_OK);            //Should have had no issues.
 }
 
-//Send data to the LCD. Uses spi_device_transmit, which waits until the transfer is complete.
+/* Send data to the LCD. Uses spi_device_polling_transmit, which waits until the
+ * transfer is complete.
+ *
+ * Since data transactions are usually small, they are handled in polling
+ * mode for higher speed. The overhead of interrupt transactions is more than
+ * just waiting for the transaction to complete.
+ */
 void lcd_data(spi_device_handle_t spi, const uint8_t *data, int len)
 {
     esp_err_t ret;
@@ -175,7 +187,7 @@ void lcd_data(spi_device_handle_t spi, const uint8_t *data, int len)
     t.length=len*8;                 //Len is in bytes, transaction length is in bits.
     t.tx_buffer=data;               //Data
     t.user=(void*)1;                //D/C needs to be set to 1
-    ret=spi_device_transmit(spi, &t);  //Transmit!
+    ret=spi_device_polling_transmit(spi, &t);  //Transmit!
     assert(ret==ESP_OK);            //Should have had no issues.
 }
 
@@ -190,7 +202,7 @@ void lcd_spi_pre_transfer_callback(spi_transaction_t *t)
 uint32_t lcd_get_id(spi_device_handle_t spi)
 {
     //get_id cmd
-    lcd_cmd( spi, 0x04);
+    lcd_cmd(spi, 0x04);
 
     spi_transaction_t t;
     memset(&t, 0, sizeof(t));
@@ -198,7 +210,7 @@ uint32_t lcd_get_id(spi_device_handle_t spi)
     t.flags = SPI_TRANS_USE_RXDATA;
     t.user = (void*)1;
 
-    esp_err_t ret = spi_device_transmit(spi, &t);
+    esp_err_t ret = spi_device_polling_transmit(spi, &t);
     assert( ret == ESP_OK );
 
     return *(uint32_t*)t.rx_data;
@@ -269,10 +281,13 @@ void lcd_init(spi_device_handle_t spi)
 }
 
 
-//To send a set of lines we have to send a command, 2 data bytes, another command, 2 more data bytes and another command
-//before sending the line data itself; a total of 6 transactions. (We can't put all of this in just one transaction
-//because the D/C line needs to be toggled in the middle.)
-//This routine queues these commands up so they get sent as quickly as possible.
+/* To send a set of lines we have to send a command, 2 data bytes, another command, 2 more data bytes and another command
+ * before sending the line data itself; a total of 6 transactions. (We can't put all of this in just one transaction
+ * because the D/C line needs to be toggled in the middle.)
+ * This routine queues these commands up as interrupt transactions so they get
+ * sent faster (compared to calling spi_device_transmit several times), and in
+ * the meantime the lines for the next transactions can be calculated.
+ */
 static void send_lines(spi_device_handle_t spi, int ypos, uint16_t *linedata)
 {
     esp_err_t ret;