[PATCH V3] net: dwc_eth_qos: Pad descriptors to cacheline size

View options: classic | list | threaded
4 messages · Options
Reply | Threaded
Open this post in threaded view
|

[PATCH V3] net: dwc_eth_qos: Pad descriptors to cacheline size

Marek Vasut-3
The DWMAC4 IP has the possibility to skip up to 7 AXI bus width size words
after the descriptor. Use this to pad the descriptors to cacheline size and
remove the need for noncached memory altogether. Moreover, this lets Tegra
use the generic cache flush / invalidate operations.

Signed-off-by: Marek Vasut <[hidden email]>
Cc: Joe Hershberger <[hidden email]>
Cc: Patrice Chotard <[hidden email]>
Cc: Patrick Delaunay <[hidden email]>
Cc: Ramon Fried <[hidden email]>
Cc: Stephen Warren <[hidden email]>
---
V2: Consider AXI bus width, disable noncached memory on STM32MP1 and Tegra
V3: - Replace max() with ALIGN()
    - Keep noncached memory enabled on tegra for r8169
---
 drivers/net/dwc_eth_qos.c  | 129 +++++++++++++++----------------------
 include/configs/stm32mp1.h |   1 -
 2 files changed, 51 insertions(+), 79 deletions(-)

diff --git a/drivers/net/dwc_eth_qos.c b/drivers/net/dwc_eth_qos.c
index db1102562f6..dd0d64f7772 100644
--- a/drivers/net/dwc_eth_qos.c
+++ b/drivers/net/dwc_eth_qos.c
@@ -209,6 +209,7 @@ struct eqos_dma_regs {
 #define EQOS_DMA_SYSBUS_MODE_BLEN8 BIT(2)
 #define EQOS_DMA_SYSBUS_MODE_BLEN4 BIT(1)
 
+#define EQOS_DMA_CH0_CONTROL_DSL_SHIFT 18
 #define EQOS_DMA_CH0_CONTROL_PBLX8 BIT(16)
 
 #define EQOS_DMA_CH0_TX_CONTROL_TXPBL_SHIFT 16
@@ -239,37 +240,15 @@ struct eqos_tegra186_regs {
 #define EQOS_AUTO_CAL_STATUS_ACTIVE BIT(31)
 
 /* Descriptors */
-
-#define EQOS_DESCRIPTOR_WORDS 4
-#define EQOS_DESCRIPTOR_SIZE (EQOS_DESCRIPTOR_WORDS * 4)
 /* We assume ARCH_DMA_MINALIGN >= 16; 16 is the EQOS HW minimum */
 #define EQOS_DESCRIPTOR_ALIGN ARCH_DMA_MINALIGN
 #define EQOS_DESCRIPTORS_TX 4
 #define EQOS_DESCRIPTORS_RX 4
 #define EQOS_DESCRIPTORS_NUM (EQOS_DESCRIPTORS_TX + EQOS_DESCRIPTORS_RX)
-#define EQOS_DESCRIPTORS_SIZE ALIGN(EQOS_DESCRIPTORS_NUM * \
-      EQOS_DESCRIPTOR_SIZE, ARCH_DMA_MINALIGN)
 #define EQOS_BUFFER_ALIGN ARCH_DMA_MINALIGN
 #define EQOS_MAX_PACKET_SIZE ALIGN(1568, ARCH_DMA_MINALIGN)
 #define EQOS_RX_BUFFER_SIZE (EQOS_DESCRIPTORS_RX * EQOS_MAX_PACKET_SIZE)
 
-/*
- * Warn if the cache-line size is larger than the descriptor size. In such
- * cases the driver will likely fail because the CPU needs to flush the cache
- * when requeuing RX buffers, therefore descriptors written by the hardware
- * may be discarded. Architectures with full IO coherence, such as x86, do not
- * experience this issue, and hence are excluded from this condition.
- *
- * This can be fixed by defining CONFIG_SYS_NONCACHED_MEMORY which will cause
- * the driver to allocate descriptors from a pool of non-cached memory.
- */
-#if EQOS_DESCRIPTOR_SIZE < ARCH_DMA_MINALIGN
-#if !defined(CONFIG_SYS_NONCACHED_MEMORY) && \
- !CONFIG_IS_ENABLED(SYS_DCACHE_OFF) && !defined(CONFIG_X86)
-#warning Cache line size is larger than descriptor size
-#endif
-#endif
-
 struct eqos_desc {
  u32 des0;
  u32 des1;
@@ -282,12 +261,17 @@ struct eqos_desc {
 #define EQOS_DESC3_LD BIT(28)
 #define EQOS_DESC3_BUF1V BIT(24)
 
+#define EQOS_AXI_WIDTH_32 4
+#define EQOS_AXI_WIDTH_64 8
+#define EQOS_AXI_WIDTH_128 16
+
 struct eqos_config {
  bool reg_access_always_ok;
  int mdio_wait;
  int swr_wait;
  int config_mac;
  int config_mac_mdio;
+ unsigned int axi_bus_width;
  phy_interface_t (*interface)(struct udevice *dev);
  struct eqos_ops *ops;
 };
@@ -330,9 +314,8 @@ struct eqos_priv {
  int phyaddr;
  u32 max_speed;
  void *descs;
- struct eqos_desc *tx_descs;
- struct eqos_desc *rx_descs;
  int tx_desc_idx, rx_desc_idx;
+ unsigned int desc_size;
  void *tx_dma_buf;
  void *rx_dma_buf;
  void *rx_pkt;
@@ -358,63 +341,42 @@ struct eqos_priv {
  * not have the same constraints since they are 1536 bytes large, so they
  * are unlikely to share cache-lines.
  */
-static void *eqos_alloc_descs(unsigned int num)
+static void *eqos_alloc_descs(struct eqos_priv *eqos, unsigned int num)
 {
-#ifdef CONFIG_SYS_NONCACHED_MEMORY
- return (void *)noncached_alloc(EQOS_DESCRIPTORS_SIZE,
-      EQOS_DESCRIPTOR_ALIGN);
-#else
- return memalign(EQOS_DESCRIPTOR_ALIGN, EQOS_DESCRIPTORS_SIZE);
-#endif
+ eqos->desc_size = ALIGN(sizeof(struct eqos_desc),
+ (unsigned int)ARCH_DMA_MINALIGN);
+
+ return memalign(eqos->desc_size, num * eqos->desc_size);
 }
 
 static void eqos_free_descs(void *descs)
 {
-#ifdef CONFIG_SYS_NONCACHED_MEMORY
- /* FIXME: noncached_alloc() has no opposite */
-#else
  free(descs);
-#endif
 }
 
-static void eqos_inval_desc_tegra186(void *desc)
+static struct eqos_desc *eqos_get_desc(struct eqos_priv *eqos,
+       unsigned int num, bool rx)
 {
-#ifndef CONFIG_SYS_NONCACHED_MEMORY
- unsigned long start = (unsigned long)desc & ~(ARCH_DMA_MINALIGN - 1);
- unsigned long end = ALIGN(start + EQOS_DESCRIPTOR_SIZE,
-  ARCH_DMA_MINALIGN);
-
- invalidate_dcache_range(start, end);
-#endif
+ return eqos->descs +
+ ((rx ? EQOS_DESCRIPTORS_TX : 0) + num) * eqos->desc_size;
 }
 
 static void eqos_inval_desc_generic(void *desc)
 {
-#ifndef CONFIG_SYS_NONCACHED_MEMORY
- unsigned long start = rounddown((unsigned long)desc, ARCH_DMA_MINALIGN);
- unsigned long end = roundup((unsigned long)desc + EQOS_DESCRIPTOR_SIZE,
-    ARCH_DMA_MINALIGN);
+ unsigned long start = (unsigned long)desc;
+ unsigned long end = ALIGN(start + sizeof(struct eqos_desc),
+  ARCH_DMA_MINALIGN);
 
  invalidate_dcache_range(start, end);
-#endif
-}
-
-static void eqos_flush_desc_tegra186(void *desc)
-{
-#ifndef CONFIG_SYS_NONCACHED_MEMORY
- flush_cache((unsigned long)desc, EQOS_DESCRIPTOR_SIZE);
-#endif
 }
 
 static void eqos_flush_desc_generic(void *desc)
 {
-#ifndef CONFIG_SYS_NONCACHED_MEMORY
- unsigned long start = rounddown((unsigned long)desc, ARCH_DMA_MINALIGN);
- unsigned long end = roundup((unsigned long)desc + EQOS_DESCRIPTOR_SIZE,
-    ARCH_DMA_MINALIGN);
+ unsigned long start = (unsigned long)desc;
+ unsigned long end = ALIGN(start + sizeof(struct eqos_desc),
+  ARCH_DMA_MINALIGN);
 
  flush_dcache_range(start, end);
-#endif
 }
 
 static void eqos_inval_buffer_tegra186(void *buf, size_t size)
@@ -1167,6 +1129,7 @@ static int eqos_start(struct udevice *dev)
  ulong rate;
  u32 val, tx_fifo_sz, rx_fifo_sz, tqs, rqs, pbl;
  ulong last_rx_desc;
+ ulong desc_pad;
 
  debug("%s(dev=%p):\n", __func__, dev);
 
@@ -1405,8 +1368,12 @@ static int eqos_start(struct udevice *dev)
  EQOS_MAX_PACKET_SIZE <<
  EQOS_DMA_CH0_RX_CONTROL_RBSZ_SHIFT);
 
+ desc_pad = (eqos->desc_size - sizeof(struct eqos_desc)) /
+   eqos->config->axi_bus_width;
+
  setbits_le32(&eqos->dma_regs->ch0_control,
-     EQOS_DMA_CH0_CONTROL_PBLX8);
+     EQOS_DMA_CH0_CONTROL_PBLX8 |
+     (desc_pad << EQOS_DMA_CH0_CONTROL_DSL_SHIFT));
 
  /*
  * Burst length must be < 1/2 FIFO size.
@@ -1435,9 +1402,15 @@ static int eqos_start(struct udevice *dev)
 
  /* Set up descriptors */
 
- memset(eqos->descs, 0, EQOS_DESCRIPTORS_SIZE);
+ memset(eqos->descs, 0, eqos->desc_size * EQOS_DESCRIPTORS_NUM);
+
+ for (i = 0; i < EQOS_DESCRIPTORS_TX; i++) {
+ struct eqos_desc *tx_desc = eqos_get_desc(eqos, i, false);
+ eqos->config->ops->eqos_flush_desc(tx_desc);
+ }
+
  for (i = 0; i < EQOS_DESCRIPTORS_RX; i++) {
- struct eqos_desc *rx_desc = &(eqos->rx_descs[i]);
+ struct eqos_desc *rx_desc = eqos_get_desc(eqos, i, true);
  rx_desc->des0 = (u32)(ulong)(eqos->rx_dma_buf +
      (i * EQOS_MAX_PACKET_SIZE));
  rx_desc->des3 = EQOS_DESC3_OWN | EQOS_DESC3_BUF1V;
@@ -1449,12 +1422,14 @@ static int eqos_start(struct udevice *dev)
  }
 
  writel(0, &eqos->dma_regs->ch0_txdesc_list_haddress);
- writel((ulong)eqos->tx_descs, &eqos->dma_regs->ch0_txdesc_list_address);
+ writel((ulong)eqos_get_desc(eqos, 0, false),
+ &eqos->dma_regs->ch0_txdesc_list_address);
  writel(EQOS_DESCRIPTORS_TX - 1,
        &eqos->dma_regs->ch0_txdesc_ring_length);
 
  writel(0, &eqos->dma_regs->ch0_rxdesc_list_haddress);
- writel((ulong)eqos->rx_descs, &eqos->dma_regs->ch0_rxdesc_list_address);
+ writel((ulong)eqos_get_desc(eqos, 0, true),
+ &eqos->dma_regs->ch0_rxdesc_list_address);
  writel(EQOS_DESCRIPTORS_RX - 1,
        &eqos->dma_regs->ch0_rxdesc_ring_length);
 
@@ -1473,7 +1448,7 @@ static int eqos_start(struct udevice *dev)
  * that's not distinguishable from none of the descriptors being
  * available.
  */
- last_rx_desc = (ulong)&(eqos->rx_descs[(EQOS_DESCRIPTORS_RX - 1)]);
+ last_rx_desc = (ulong)eqos_get_desc(eqos, EQOS_DESCRIPTORS_RX - 1, true);
  writel(last_rx_desc, &eqos->dma_regs->ch0_rxdesc_tail_pointer);
 
  eqos->started = true;
@@ -1558,7 +1533,7 @@ static int eqos_send(struct udevice *dev, void *packet, int length)
  memcpy(eqos->tx_dma_buf, packet, length);
  eqos->config->ops->eqos_flush_buffer(eqos->tx_dma_buf, length);
 
- tx_desc = &(eqos->tx_descs[eqos->tx_desc_idx]);
+ tx_desc = eqos_get_desc(eqos, eqos->tx_desc_idx, false);
  eqos->tx_desc_idx++;
  eqos->tx_desc_idx %= EQOS_DESCRIPTORS_TX;
 
@@ -1573,7 +1548,7 @@ static int eqos_send(struct udevice *dev, void *packet, int length)
  tx_desc->des3 = EQOS_DESC3_OWN | EQOS_DESC3_FD | EQOS_DESC3_LD | length;
  eqos->config->ops->eqos_flush_desc(tx_desc);
 
- writel((ulong)(&(eqos->tx_descs[eqos->tx_desc_idx])),
+ writel((ulong)eqos_get_desc(eqos, eqos->tx_desc_idx, false),
  &eqos->dma_regs->ch0_txdesc_tail_pointer);
 
  for (i = 0; i < 1000000; i++) {
@@ -1596,7 +1571,7 @@ static int eqos_recv(struct udevice *dev, int flags, uchar **packetp)
 
  debug("%s(dev=%p, flags=%x):\n", __func__, dev, flags);
 
- rx_desc = &(eqos->rx_descs[eqos->rx_desc_idx]);
+ rx_desc = eqos_get_desc(eqos, eqos->rx_desc_idx, true);
  eqos->config->ops->eqos_inval_desc(rx_desc);
  if (rx_desc->des3 & EQOS_DESC3_OWN) {
  debug("%s: RX packet not available\n", __func__);
@@ -1631,7 +1606,7 @@ static int eqos_free_pkt(struct udevice *dev, uchar *packet, int length)
 
  eqos->config->ops->eqos_inval_buffer(packet, length);
 
- rx_desc = &(eqos->rx_descs[eqos->rx_desc_idx]);
+ rx_desc = eqos_get_desc(eqos, eqos->rx_desc_idx, true);
 
  rx_desc->des0 = 0;
  mb();
@@ -1663,17 +1638,12 @@ static int eqos_probe_resources_core(struct udevice *dev)
 
  debug("%s(dev=%p):\n", __func__, dev);
 
- eqos->descs = eqos_alloc_descs(EQOS_DESCRIPTORS_TX +
-       EQOS_DESCRIPTORS_RX);
+ eqos->descs = eqos_alloc_descs(eqos, EQOS_DESCRIPTORS_NUM);
  if (!eqos->descs) {
  debug("%s: eqos_alloc_descs() failed\n", __func__);
  ret = -ENOMEM;
  goto err;
  }
- eqos->tx_descs = (struct eqos_desc *)eqos->descs;
- eqos->rx_descs = (eqos->tx_descs + EQOS_DESCRIPTORS_TX);
- debug("%s: tx_descs=%p, rx_descs=%p\n", __func__, eqos->tx_descs,
-      eqos->rx_descs);
 
  eqos->tx_dma_buf = memalign(EQOS_BUFFER_ALIGN, EQOS_MAX_PACKET_SIZE);
  if (!eqos->tx_dma_buf) {
@@ -2083,8 +2053,8 @@ static const struct eth_ops eqos_ops = {
 };
 
 static struct eqos_ops eqos_tegra186_ops = {
- .eqos_inval_desc = eqos_inval_desc_tegra186,
- .eqos_flush_desc = eqos_flush_desc_tegra186,
+ .eqos_inval_desc = eqos_inval_desc_generic,
+ .eqos_flush_desc = eqos_flush_desc_generic,
  .eqos_inval_buffer = eqos_inval_buffer_tegra186,
  .eqos_flush_buffer = eqos_flush_buffer_tegra186,
  .eqos_probe_resources = eqos_probe_resources_tegra186,
@@ -2105,6 +2075,7 @@ static const struct eqos_config __maybe_unused eqos_tegra186_config = {
  .swr_wait = 10,
  .config_mac = EQOS_MAC_RXQ_CTRL0_RXQ0EN_ENABLED_DCB,
  .config_mac_mdio = EQOS_MAC_MDIO_ADDRESS_CR_20_35,
+ .axi_bus_width = EQOS_AXI_WIDTH_128,
  .interface = eqos_get_interface_tegra186,
  .ops = &eqos_tegra186_ops
 };
@@ -2132,6 +2103,7 @@ static const struct eqos_config __maybe_unused eqos_stm32_config = {
  .swr_wait = 50,
  .config_mac = EQOS_MAC_RXQ_CTRL0_RXQ0EN_ENABLED_AV,
  .config_mac_mdio = EQOS_MAC_MDIO_ADDRESS_CR_250_300,
+ .axi_bus_width = EQOS_AXI_WIDTH_64,
  .interface = eqos_get_interface_stm32,
  .ops = &eqos_stm32_ops
 };
@@ -2159,6 +2131,7 @@ struct eqos_config __maybe_unused eqos_imx_config = {
  .swr_wait = 50,
  .config_mac = EQOS_MAC_RXQ_CTRL0_RXQ0EN_ENABLED_DCB,
  .config_mac_mdio = EQOS_MAC_MDIO_ADDRESS_CR_250_300,
+ .axi_bus_width = EQOS_AXI_WIDTH_64,
  .interface = eqos_get_interface_imx,
  .ops = &eqos_imx_ops
 };
diff --git a/include/configs/stm32mp1.h b/include/configs/stm32mp1.h
index 1f6cb2919b5..43b6d7d8afc 100644
--- a/include/configs/stm32mp1.h
+++ b/include/configs/stm32mp1.h
@@ -69,7 +69,6 @@
 
 /* Ethernet need */
 #ifdef CONFIG_DWC_ETH_QOS
-#define CONFIG_SYS_NONCACHED_MEMORY (1 * SZ_1M) /* 1M */
 #define CONFIG_SERVERIP                 192.168.1.1
 #define CONFIG_BOOTP_SERVERIP
 #define CONFIG_SYS_AUTOLOAD "no"
--
2.29.2

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH V3] net: dwc_eth_qos: Pad descriptors to cacheline size

Stephen Warren-2
On 1/7/21 3:12 AM, Marek Vasut wrote:
> The DWMAC4 IP has the possibility to skip up to 7 AXI bus width size words
> after the descriptor. Use this to pad the descriptors to cacheline size and
> remove the need for noncached memory altogether. Moreover, this lets Tegra
> use the generic cache flush / invalidate operations.

Tested-by: Stephen Warren <[hidden email]>
Reviewed-by: Stephen Warren <[hidden email]>
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH V3] net: dwc_eth_qos: Pad descriptors to cacheline size

Marek Vasut-3
On 1/7/21 5:33 PM, Stephen Warren wrote:
> On 1/7/21 3:12 AM, Marek Vasut wrote:
>> The DWMAC4 IP has the possibility to skip up to 7 AXI bus width size words
>> after the descriptor. Use this to pad the descriptors to cacheline size and
>> remove the need for noncached memory altogether. Moreover, this lets Tegra
>> use the generic cache flush / invalidate operations.
>
> Tested-by: Stephen Warren <[hidden email]>
> Reviewed-by: Stephen Warren <[hidden email]>

Thanks.

This also really needs a TB/RB from ST before this is applied.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH V3] net: dwc_eth_qos: Pad descriptors to cacheline size

Patrice Chotard-2
Hi Marek

On 1/7/21 7:16 PM, Marek Vasut wrote:

> On 1/7/21 5:33 PM, Stephen Warren wrote:
>> On 1/7/21 3:12 AM, Marek Vasut wrote:
>>> The DWMAC4 IP has the possibility to skip up to 7 AXI bus width size words
>>> after the descriptor. Use this to pad the descriptors to cacheline size and
>>> remove the need for noncached memory altogether. Moreover, this lets Tegra
>>> use the generic cache flush / invalidate operations.
>>
>> Tested-by: Stephen Warren <[hidden email]>
>> Reviewed-by: Stephen Warren <[hidden email]>
>
> Thanks.
>
> This also really needs a TB/RB from ST before this is applied.

Tested-by: Patrice Chotard <[hidden email]>

Tested on a stm32mp157c-ev1 board

Thanks

Patrice