In DMA mode, ECSPI transfers only one word per frame, and BURST_LENGTH
must be updated between words; the resulting SCLK stalls between words
significantly hurt performance.

To improve throughput, configure BURST_LENGTH as large as possible (up to
512 bytes per frame) instead of word length. This avoids delays between
words. When transfer length is not 4-byte aligned, use bounce buffers to
align data for DMA. TX uses aligned words for TXFIFO, while RX trims DMA
buffer data after transfer completion.
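
A rough sketch of the alignment arithmetic (illustrative C, not the
exact driver code; the names are made up):

  /* DMA always moves 32-bit words, so pad the bounce buffer. */
  dma_len = ALIGN(data_len, 4);

  /* Per the RM, a burst of 32*n + m bits carries the m LSBs in the
   * first word, so the valid RX bytes start after the padding. */
  unaligned = data_len % 4;
  offset = unaligned ? 4 - unaligned : 0;
  memcpy(rx_buf, (u8 *)dma_rx_buf + offset, data_len);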

Introduce a new dma_package structure (usage sketched below) to store:
  1. BURST_LENGTH values for each DMA request
  2. Variables for DMA submission
  3. DMA transmission length and actual data length
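
In the two-package case, the array is filled roughly as follows
(sketch; BL_MASK/BL_OFFSET stand in for the MX51_ECSPI_CTRL_BL_*
macros used by the driver):

  ctrl &= ~BL_MASK;
  dma_data[0].cmd_word = ctrl | (pre_bl << BL_OFFSET);
  dma_data[0].data_len = round_down(len, 512);  /* aligned body   */
  dma_data[1].cmd_word = ctrl | (tail_bl << BL_OFFSET);
  dma_data[1].data_len = len % 512;             /* unaligned tail */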

Handle three cases (worked example below):
  - len <= 512 bytes: one package, BURST_LENGTH = len * 8 - 1
  - len > 512 and aligned: one package, BURST_LENGTH = max (512 bytes)
  - len > 512 and unaligned: two packages, second for tail data
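
For example, a 1001-byte transfer falls into the third case: package 0
carries 512 bytes with BURST_LENGTH = 512 * 8 - 1 = 4095, and package 1
carries the 489-byte tail with BURST_LENGTH = 489 * 8 - 1 = 3911. The
tail's DMA length is padded to 492 bytes, and the 3 garbage bytes are
trimmed from the first RX word.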

Performance test (spidev_test @10MHz, 4KB):
  Before: tx/rx ~6651.9 kbps
  After:  tx/rx ~9922.2 kbps (~50% improvement)
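
(A command along the lines of "spidev_test -D /dev/spidevX.Y
-s 10000000 -S 4096 -I 16" should reproduce this setup; exact options
depend on the spidev_test version in tools/spi/.)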

For compatibility with slow SPI devices, add configurable word delay in
DMA mode. When word delay is set, dynamic burst is disabled and
BURST_LENGTH equals word length.
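
A minimal sketch of that decision, mirroring spi_imx_dma_data_prepare()
in the diff below (burst_len_bits is illustrative):

  bool word_delay = transfer->word_delay.value != 0;

  if (word_delay) {
          /* One word per frame; ECSPI inserts the delay between frames. */
          burst_len_bits = bits_per_word - 1;
  } else {
          /* Up to 512 bytes per frame, no inter-word SCLK stalls. */
          burst_len_bits = min(len, 512) * 8 - 1;
  }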

Signed-off-by: Carlos Song <[email protected]>
---
 drivers/spi/spi-imx.c | 413 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 377 insertions(+), 36 deletions(-)

diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c
index 42f64d9535c9..045f4ffd680a 100644
--- a/drivers/spi/spi-imx.c
+++ b/drivers/spi/spi-imx.c
@@ -60,6 +60,7 @@ MODULE_PARM_DESC(polling_limit_us,
 #define MX51_ECSPI_CTRL_MAX_BURST      512
 /* The maximum bytes that IMX53_ECSPI can transfer in target mode.*/
 #define MX53_MAX_TRANSFER_BYTES                512
+#define BYTES_PER_32BITS_WORD          4
 
 enum spi_imx_devtype {
        IMX1_CSPI,
@@ -95,6 +96,16 @@ struct spi_imx_devtype_data {
        enum spi_imx_devtype devtype;
 };
 
+struct dma_data_package {
+       u32 cmd_word;
+       void *dma_rx_buf;
+       void *dma_tx_buf;
+       dma_addr_t dma_tx_addr;
+       dma_addr_t dma_rx_addr;
+       int dma_len;
+       int data_len;
+};
+
 struct spi_imx_data {
        struct spi_controller *controller;
        struct device *dev;
@@ -130,6 +141,9 @@ struct spi_imx_data {
        u32 wml;
        struct completion dma_rx_completion;
        struct completion dma_tx_completion;
+       size_t dma_package_num;
+       struct dma_data_package *dma_data __counted_by(dma_package_num);
+       int rx_offset;
 
        const struct spi_imx_devtype_data *devtype_data;
 };
@@ -189,6 +203,9 @@ MXC_SPI_BUF_TX(u16)
 MXC_SPI_BUF_RX(u32)
 MXC_SPI_BUF_TX(u32)
 
+/* Align to cache line to avoid swiotlb bounce */
+#define DMA_CACHE_ALIGNED_LEN(x) ALIGN((x), dma_get_cache_alignment())
+
 /* First entry is reserved, second entry is valid only if SDHC_SPIEN is set
  * (which is currently not the case in this driver)
  */
@@ -253,6 +270,14 @@ static bool spi_imx_can_dma(struct spi_controller *controller, struct spi_device
        if (transfer->len < spi_imx->devtype_data->fifo_size)
                return false;
 
+       /* DMA can only transmit whole bytes, so bpw must be 8, 16 or 32 */
+       if (spi_imx->bits_per_word != 8 && spi_imx->bits_per_word != 16 &&
+           spi_imx->bits_per_word != 32)
+               return false;
+
+       if (transfer->len >= MAX_SDMA_BD_BYTES)
+               return false;
+
        spi_imx->dynamic_burst = 0;
 
        return true;
@@ -1398,8 +1423,6 @@ static int spi_imx_sdma_init(struct device *dev, struct spi_imx_data *spi_imx,
 
        init_completion(&spi_imx->dma_rx_completion);
        init_completion(&spi_imx->dma_tx_completion);
-       controller->can_dma = spi_imx_can_dma;
-       controller->max_dma_len = MAX_SDMA_BD_BYTES;
        spi_imx->controller->flags = SPI_CONTROLLER_MUST_RX |
                                         SPI_CONTROLLER_MUST_TX;
 
@@ -1437,10 +1460,259 @@ static int spi_imx_calculate_timeout(struct spi_imx_data *spi_imx, int size)
        return secs_to_jiffies(2 * timeout);
 }
 
+static void spi_imx_dma_unmap(struct spi_imx_data *spi_imx,
+                             struct dma_data_package *dma_data)
+{
+       struct device *tx_dev = spi_imx->controller->dma_tx->device->dev;
+       struct device *rx_dev = spi_imx->controller->dma_rx->device->dev;
+
+       dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
+                        DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+                        DMA_TO_DEVICE);
+       dma_unmap_single(rx_dev, dma_data->dma_rx_addr,
+                        DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+                        DMA_FROM_DEVICE);
+}
+
+static void spi_imx_dma_rx_data_handle(struct spi_imx_data *spi_imx,
+                                      struct dma_data_package *dma_data, void *rx_buf,
+                                      bool word_delay)
+{
+       void *copy_ptr;
+       int unaligned;
+
+       /*
+        * On little-endian CPUs, adjust byte order:
+        * - Swap bytes when bpw = 8
+        * - Swap half-words when bpw = 16
+        * This ensures correct data ordering for DMA transfers.
+        */
+#ifdef __LITTLE_ENDIAN
+       if (!word_delay) {
+               unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
+               u32 *temp = dma_data->dma_rx_buf;
+
+               for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, sizeof(*temp)); i++) {
+                       if (bytes_per_word == 1)
+                               swab32s(temp + i);
+                       else if (bytes_per_word == 2)
+                               swahw32s(temp + i);
+               }
+       }
+#endif
+
+       /*
+        * With dynamic burst enabled, DMA RX always receives 32-bit words from
+        * the RXFIFO with buswidth = 4. When data_len is not 4-byte aligned,
+        * the RM states that for a burst length of 32*n + m bits, a SPI burst
+        * carries the m LSBs in the first word and all 32 bits in the other n
+        * words. So the first word contains garbage bytes: trim it, then copy
+        * the actual data to rx_buf.
+        */
+       if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
+               unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
+               copy_ptr = (u8 *)dma_data->dma_rx_buf + BYTES_PER_32BITS_WORD - unaligned;
+       } else {
+               copy_ptr = dma_data->dma_rx_buf;
+       }
+
+       memcpy(rx_buf, copy_ptr, dma_data->data_len);
+}
+
+static int spi_imx_dma_map(struct spi_imx_data *spi_imx,
+                          struct dma_data_package *dma_data)
+{
+       struct spi_controller *controller = spi_imx->controller;
+       struct device *tx_dev = controller->dma_tx->device->dev;
+       struct device *rx_dev = controller->dma_rx->device->dev;
+       int ret;
+
+       dma_data->dma_tx_addr = dma_map_single(tx_dev, dma_data->dma_tx_buf,
+                                              DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+                                              DMA_TO_DEVICE);
+       ret = dma_mapping_error(tx_dev, dma_data->dma_tx_addr);
+       if (ret < 0) {
+               dev_err(spi_imx->dev, "DMA TX map failed %d\n", ret);
+               return ret;
+       }
+
+       dma_data->dma_rx_addr = dma_map_single(rx_dev, dma_data->dma_rx_buf,
+                                              DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+                                              DMA_FROM_DEVICE);
+       ret = dma_mapping_error(rx_dev, dma_data->dma_rx_addr);
+       if (ret < 0) {
+               dev_err(spi_imx->dev, "DMA RX map failed %d\n", ret);
+               dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
+                                DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+                                DMA_TO_DEVICE);
+               return ret;
+       }
+
+       return 0;
+}
+
+static int spi_imx_dma_tx_data_handle(struct spi_imx_data *spi_imx,
+                                     struct dma_data_package *dma_data,
+                                     const void *tx_buf,
+                                     bool word_delay)
+{
+       void *copy_ptr;
+       int unaligned;
+
+       if (word_delay) {
+               dma_data->dma_len = dma_data->data_len;
+       } else {
+               /*
+                * As per the reference manual, when burst length = 32*n + m
+                * bits, ECSPI sends the m LSB bits in the first word, followed
+                * by n full 32-bit words. Since the actual data may not be
+                * 4-byte aligned, allocate aligned DMA TX/RX bounce buffers.
+                * For TX, DMA pushes 4-byte aligned words to the TXFIFO while
+                * ECSPI relies on BURST_LENGTH to keep the bit count correct.
+                * For RX, DMA always receives 32-bit words from the RXFIFO;
+                * when the data length is not 4-byte aligned, trim the first
+                * word to drop the garbage bytes, then copy all valid data
+                * from the DMA bounce buffers to rx_buf.
+                */
+               dma_data->dma_len = ALIGN(dma_data->data_len, BYTES_PER_32BITS_WORD);
+       }
+
+       dma_data->dma_tx_buf = kzalloc(dma_data->dma_len, GFP_KERNEL);
+       if (!dma_data->dma_tx_buf)
+               return -ENOMEM;
+
+       dma_data->dma_rx_buf = kzalloc(dma_data->dma_len, GFP_KERNEL);
+       if (!dma_data->dma_rx_buf) {
+               kfree(dma_data->dma_tx_buf);
+               return -ENOMEM;
+       }
+
+       if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
+               unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
+               copy_ptr = (u8 *)dma_data->dma_tx_buf + BYTES_PER_32BITS_WORD - unaligned;
+       } else {
+               copy_ptr = dma_data->dma_tx_buf;
+       }
+
+       memcpy(copy_ptr, tx_buf, dma_data->data_len);
+
+       /*
+        * When word_delay is enabled, DMA transfers an entire word in one minor loop.
+        * In this case, no data requires additional handling.
+        */
+       if (word_delay)
+               return 0;
+
+#ifdef __LITTLE_ENDIAN
+       /*
+        * On little-endian CPUs, adjust byte order:
+        * - Swap bytes when bpw = 8
+        * - Swap half-words when bpw = 16
+        * This ensures correct data ordering for DMA transfers.
+        */
+       unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
+       u32 *temp = dma_data->dma_tx_buf;
+
+       for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, sizeof(*temp)); i++) {
+               if (bytes_per_word == 1)
+                       swab32s(temp + i);
+               else if (bytes_per_word == 2)
+                       swahw32s(temp + i);
+       }
+#endif
+
+       return 0;
+}
+
+static int spi_imx_dma_data_prepare(struct spi_imx_data *spi_imx,
+                                   struct spi_transfer *transfer,
+                                   bool word_delay)
+{
+       u32 pre_bl, tail_bl;
+       u32 ctrl;
+       int ret;
+
+       /*
+        * ECSPI supports a maximum burst of 512 bytes. When xfer->len exceeds
+        * 512 and is not a multiple of 512, a tail transfer is required.
+        * BURST_LENGTH tells the SPI hardware the exact bit count, so it must
+        * be kept in step with the data length. Once a DMA request has been
+        * submitted, BURST_LENGTH can no longer be updated; in that case the
+        * transfer must be split into two packages, updating the register
+        * before setting up the second DMA transfer.
+        */
+       ctrl = readl(spi_imx->base + MX51_ECSPI_CTRL);
+       if (word_delay) {
+               /*
+                * To support word delay, the Sample Period Control Register
+                * (ECSPI_PERIODREG) provides software a way to insert delays
+                * (wait states) between consecutive SPI transfers. As a
+                * result, ECSPI can only transfer one word per frame, and the
+                * delay occurs between frames.
+                */
+               spi_imx->dma_package_num = 1;
+               pre_bl = spi_imx->bits_per_word - 1;
+       } else if (transfer->len <= MX51_ECSPI_CTRL_MAX_BURST) {
+               spi_imx->dma_package_num = 1;
+               pre_bl = transfer->len * BITS_PER_BYTE - 1;
+       } else if (!(transfer->len % MX51_ECSPI_CTRL_MAX_BURST)) {
+               spi_imx->dma_package_num = 1;
+               pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
+       } else {
+               spi_imx->dma_package_num = 2;
+               pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
+               tail_bl = (transfer->len % MX51_ECSPI_CTRL_MAX_BURST) * BITS_PER_BYTE - 1;
+       }
+
+       spi_imx->dma_data = kmalloc_array(spi_imx->dma_package_num,
+                                         sizeof(struct dma_data_package),
+                                         GFP_KERNEL | __GFP_ZERO);
+       if (!spi_imx->dma_data) {
+               dev_err(spi_imx->dev, "Failed to allocate DMA package buffer!\n");
+               return -ENOMEM;
+       }
+
+       if (spi_imx->dma_package_num == 1) {
+               ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
+               ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
+               spi_imx->dma_data[0].cmd_word = ctrl;
+               spi_imx->dma_data[0].data_len = transfer->len;
+               ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
+                                                word_delay);
+               if (ret) {
+                       kfree(spi_imx->dma_data);
+                       return ret;
+               }
+       } else {
+               ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
+               ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
+               spi_imx->dma_data[0].cmd_word = ctrl;
+               spi_imx->dma_data[0].data_len = round_down(transfer->len,
+                                                          MX51_ECSPI_CTRL_MAX_BURST);
+               ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
+                                                false);
+               if (ret) {
+                       kfree(spi_imx->dma_data);
+                       return ret;
+               }
+
+               ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
+               ctrl |= tail_bl << MX51_ECSPI_CTRL_BL_OFFSET;
+               spi_imx->dma_data[1].cmd_word = ctrl;
+               spi_imx->dma_data[1].data_len = transfer->len % MX51_ECSPI_CTRL_MAX_BURST;
+               ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[1],
+                                                transfer->tx_buf + spi_imx->dma_data[0].data_len,
+                                                false);
+               if (ret) {
+                       kfree(spi_imx->dma_data[0].dma_tx_buf);
+                       kfree(spi_imx->dma_data[0].dma_rx_buf);
+                       kfree(spi_imx->dma_data);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
 static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
+                             struct dma_data_package *dma_data,
                              struct spi_transfer *transfer)
 {
-       struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg;
        struct spi_controller *controller = spi_imx->controller;
        struct dma_async_tx_descriptor *desc_tx, *desc_rx;
        unsigned long transfer_timeout;
@@ -1451,9 +1723,9 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
         * The TX DMA setup starts the transfer, so make sure RX is configured
         * before TX.
         */
-       desc_rx = dmaengine_prep_slave_sg(controller->dma_rx,
-                                         rx->sgl, rx->nents, DMA_DEV_TO_MEM,
-                                         DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+       desc_rx = dmaengine_prep_slave_single(controller->dma_rx, dma_data->dma_rx_addr,
+                                             dma_data->dma_len, DMA_DEV_TO_MEM,
+                                             DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc_rx) {
                transfer->error |= SPI_TRANS_FAIL_NO_START;
                return -EINVAL;
@@ -1471,9 +1743,9 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
        reinit_completion(&spi_imx->dma_rx_completion);
        dma_async_issue_pending(controller->dma_rx);
 
-       desc_tx = dmaengine_prep_slave_sg(controller->dma_tx,
-                                         tx->sgl, tx->nents, DMA_MEM_TO_DEV,
-                                         DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+       desc_tx = dmaengine_prep_slave_single(controller->dma_tx, dma_data->dma_tx_addr,
+                                             dma_data->dma_len, DMA_MEM_TO_DEV,
+                                             DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc_tx)
                goto dmaengine_terminate_rx;
 
@@ -1521,16 +1793,16 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
 }
 
 static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
-                                    struct spi_transfer *transfer)
+                                    struct dma_data_package *dma_data,
+                                    bool word_delay)
 {
-       struct sg_table *rx = &transfer->rx_sg;
-       struct scatterlist *last_sg = sg_last(rx->sgl, rx->nents);
-       unsigned int bytes_per_word, i;
+       unsigned int bytes_per_word = word_delay ?
+                                     spi_imx_bytes_per_word(spi_imx->bits_per_word) :
+                                     BYTES_PER_32BITS_WORD;
+       unsigned int i;
 
-       /* Get the right burst length from the last sg to ensure no tail data */
-       bytes_per_word = spi_imx_bytes_per_word(transfer->bits_per_word);
        for (i = spi_imx->devtype_data->fifo_size / 2; i > 0; i--) {
-               if (!(sg_dma_len(last_sg) % (i * bytes_per_word)))
+               if (!(dma_data->dma_len % (i * bytes_per_word)))
                        break;
        }
        /* Use 1 as wml in case no available burst length got */
@@ -1540,25 +1812,29 @@ static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
        spi_imx->wml = i;
 }
 
-static int spi_imx_dma_configure(struct spi_controller *controller)
+static int spi_imx_dma_configure(struct spi_controller *controller, bool word_delay)
 {
        int ret;
        enum dma_slave_buswidth buswidth;
        struct dma_slave_config rx = {}, tx = {};
        struct spi_imx_data *spi_imx = spi_controller_get_devdata(controller);
 
-       switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
-       case 4:
+       if (word_delay) {
+               switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
+               case 4:
+                       buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
+                       break;
+               case 2:
+                       buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
+                       break;
+               case 1:
+                       buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
+                       break;
+               default:
+                       return -EINVAL;
+               }
+       } else {
                buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
-               break;
-       case 2:
-               buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
-               break;
-       case 1:
-               buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
-               break;
-       default:
-               return -EINVAL;
        }
 
        tx.direction = DMA_MEM_TO_DEV;
@@ -1584,15 +1860,17 @@ static int spi_imx_dma_configure(struct spi_controller *controller)
        return 0;
 }
 
-static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
-                               struct spi_transfer *transfer)
+static int spi_imx_dma_package_transfer(struct spi_imx_data *spi_imx,
+                                       struct dma_data_package *dma_data,
+                                       struct spi_transfer *transfer,
+                                       bool word_delay)
 {
        struct spi_controller *controller = spi_imx->controller;
        int ret;
 
-       spi_imx_dma_max_wml_find(spi_imx, transfer);
+       spi_imx_dma_max_wml_find(spi_imx, dma_data, word_delay);
 
-       ret = spi_imx_dma_configure(controller);
+       ret = spi_imx_dma_configure(controller, word_delay);
        if (ret)
                goto dma_failure_no_start;
 
@@ -1603,10 +1881,17 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
        }
        spi_imx->devtype_data->setup_wml(spi_imx);
 
-       ret = spi_imx_dma_submit(spi_imx, transfer);
+       ret = spi_imx_dma_submit(spi_imx, dma_data, transfer);
        if (ret)
                return ret;
 
+       /* Trim the DMA RX buffer and copy the actual data to rx_buf */
+       dma_sync_single_for_cpu(controller->dma_rx->device->dev, dma_data->dma_rx_addr,
+                               dma_data->dma_len, DMA_FROM_DEVICE);
+       spi_imx_dma_rx_data_handle(spi_imx, dma_data, transfer->rx_buf + spi_imx->rx_offset,
+                                  word_delay);
+                                  word_delay);
+       spi_imx->rx_offset += dma_data->data_len;
+
        return 0;
 /* fallback to pio */
 dma_failure_no_start:
@@ -1614,6 +1899,57 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
        return ret;
 }
 
+static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
+                               struct spi_transfer *transfer)
+{
+       bool word_delay = transfer->word_delay.value != 0;
+       int ret;
+       int i;
+
+       ret = spi_imx_dma_data_prepare(spi_imx, transfer, word_delay);
+       if (ret < 0) {
+               transfer->error |= SPI_TRANS_FAIL_NO_START;
+               dev_err(spi_imx->dev, "DMA data prepare fail\n");
+               goto fallback_pio;
+       }
+
+       spi_imx->rx_offset = 0;
+
+       /* Each dma_data_package is submitted as its own, separate DMA transfer */
+       for (i = 0; i < spi_imx->dma_package_num; i++) {
+               ret = spi_imx_dma_map(spi_imx, &spi_imx->dma_data[i]);
+               if (ret < 0) {
+                       if (i == 0)
+                               transfer->error |= SPI_TRANS_FAIL_NO_START;
+                       dev_err(spi_imx->dev, "DMA map fail\n");
+                       break;
+               }
+
+               /* Update the CTRL register BL field */
+               writel(spi_imx->dma_data[i].cmd_word, spi_imx->base + MX51_ECSPI_CTRL);
+
+               ret = spi_imx_dma_package_transfer(spi_imx, &spi_imx->dma_data[i],
+                                                  transfer, word_delay);
+
+               /* Unmap is required whether or not the DMA transfer succeeded */
+               spi_imx_dma_unmap(spi_imx, &spi_imx->dma_data[i]);
+
+               if (ret < 0) {
+                       dev_dbg(spi_imx->dev, "DMA transfer %d did not finish\n", i);
+                       break;
+               }
+       }
+
+       for (int j = 0; j < spi_imx->dma_package_num; j++) {
+               kfree(spi_imx->dma_data[j].dma_tx_buf);
+               kfree(spi_imx->dma_data[j].dma_rx_buf);
+       }
+       kfree(spi_imx->dma_data);
+
+fallback_pio:
+       return ret;
+}
+
 static int spi_imx_pio_transfer(struct spi_device *spi,
                                struct spi_transfer *transfer)
 {
@@ -1780,9 +2116,14 @@ static int spi_imx_transfer_one(struct spi_controller *controller,
         * transfer, the SPI transfer has already been mapped, so we
         * have to do the DMA transfer here.
         */
-       if (spi_imx->usedma)
-               return spi_imx_dma_transfer(spi_imx, transfer);
-
+       if (spi_imx->usedma) {
+               ret = spi_imx_dma_transfer(spi_imx, transfer);
+               if (transfer->error & SPI_TRANS_FAIL_NO_START) {
+                       spi_imx->usedma = false;
+                       return spi_imx_pio_transfer(spi, transfer);
+               }
+               return ret;
+       }
        /* run in polling mode for short transfers */
        if (transfer->len == 1 || (polling_limit_us &&
                                   spi_imx_transfer_estimate_time_us(transfer) < polling_limit_us))
-- 
2.34.1

