Linux 4.19.90 stmmac Driver Source Code Analysis (with Diagrams)
I. Driver Compatibility
dwmac is a generic family among the many NIC drivers; it covers most of the NIC series you are likely to use:
- stm32
- sunxi
- ipq806x
- lpc18xx
- sti
- fpga
- …
Strictly speaking, the stmmac layer wraps a set of pure low-level interfaces so the upper layers can call them conveniently, but a simple project neither needs this many configuration options nor needs a dedicated structure allocated and filled in for it.
The entry points of this driver family are the various dwmac-xxxxxx NIC-type entries. Each passes in some platform data (shared data); the platform layer adds series data for the concrete hardware type (semi-shared data); this is then handed down to the stmmac layer for conversion, where it is filled into the NIC driver's private structure (private data). From then on, the stmmac layer's functions read and write that private structure, and those operations are ultimately realized in the hardware registers.
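To make that layering concrete, here is a minimal glue-driver probe, modeled on dwmac-generic.c (analyzed below); the dwmac_foo name is made up, but the three calls are the real 4.19 stmmac platform helpers:

```c
#include <linux/platform_device.h>
#include "stmmac.h"
#include "stmmac_platform.h"

/* Hypothetical glue driver: shared data -> semi-shared data -> private data */
static int dwmac_foo_probe(struct platform_device *pdev)
{
    struct plat_stmmacenet_data *plat_dat;
    struct stmmac_resources stmmac_res;
    int ret;

    /* shared data: MMIO base and IRQs from the platform device */
    ret = stmmac_get_platform_resources(pdev, &stmmac_res);
    if (ret)
        return ret;

    /* semi-shared data: everything the device tree says about this MAC */
    plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
    if (IS_ERR(plat_dat))
        return PTR_ERR(plat_dat);

    /* SoC-specific fixups on plat_dat would go here */

    /* hand everything down; stmmac fills its private structure from it */
    ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
    if (ret)
        stmmac_remove_config_dt(pdev, plat_dat);

    return ret;
}
```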
II. Initialization
This analysis is based on the Linux kernel 4.19.90 sources.
Driver path: $KERNEL_CODE/drivers/net/ethernet/stmicro/stmmac/*
1. Basic data structures
In the figure, dark blue marks concrete code entities: functions or already-initialized global operation-set variables. They can be searched for in the source, so their full definitions are not reproduced here.
2. Initialization flow
This driver generally does not support hot-plugging (or at least I have not seen it; I am not familiar with the PCIe or USB buses). The walkthrough below therefore starts from device-tree (dts) parsing, which is easy to follow and stays close to the code.
(1) Device tree parsing
platform_device
Device tree (dts) files are generally used on embedded devices, because embedded systems, unlike PCs, rarely have a rich set of buses (I2C, PCI, USB and so on); in many cases there is no enumerable bus at all. Attached devices still need a common bus to hang from. Inside an SoC, the integrated peripheral controllers and the peripherals mapped into the SoC's memory space do not belong to any such bus. To keep the driver model consistent, Linux hangs these devices on a virtual bus, the platform bus, rather than ending up with some devices on a bus and others on none.
Parsing flow
Only the rough flow is given here; platform_device will get its own dedicated write-up later.
For now it is enough to know that after the device tree has been parsed, a platform_device structure is created, and the node's key/value pairs end up in its resources.
(2) Matching a driver
Once the platform_device exists, the core looks for a driver; on a successful match the device is attached to the platform bus.
That process ends up calling the matched driver's probe function.
```c
/* the match function */
static inline int driver_match_device(struct device_driver *drv,
                                      struct device *dev)
{
    return drv->bus->match ? drv->bus->match(dev, drv) : 1;
}

static int platform_match(struct device *dev, struct device_driver *drv)
{
    struct platform_device *pdev = to_platform_device(dev);
    struct platform_driver *pdrv = to_platform_driver(drv);

    /* match against the id table first */
    if (pdrv->id_table)
        return platform_match_id(pdrv->id_table, pdev) != NULL;

    /* fall-back to driver name match */
    return (strcmp(pdev->name, drv->name) == 0);
}
```
(3) stmmac initialization
The code contains many drivers of the form dwmac_xxx_driver; here we take the most generic one:
```c
/* the supported NIC models are visible here */
static const struct of_device_id dwmac_generic_match[] = {
    { .compatible = "st,spear600-gmac" },
    { .compatible = "snps,dwmac-3.50a" },
    { .compatible = "snps,dwmac-3.610" },
    { .compatible = "snps,dwmac-3.70a" },
    { .compatible = "snps,dwmac-3.710" },
    { .compatible = "snps,dwmac-4.00" },
    { .compatible = "snps,dwmac-4.10a" },
    { .compatible = "snps,dwmac" },
    { .compatible = "snps,dwxgmac-2.10" },
    { .compatible = "snps,dwxgmac" },
    { }
};

static struct platform_driver dwmac_generic_driver = {
    .probe  = dwmac_generic_probe,
    .remove = stmmac_pltfr_remove,
    .driver = {
        .name = STMMAC_RESOURCE_NAME,
        /* mainly used for suspend/resume; ignore it for now */
        .pm = &stmmac_pltfr_pm_ops,
        /* of_match_ptr() returns the pointer passed in.
         * Used with the device tree: at boot the NIC node's
         * "compatible" property is matched against this table. */
        .of_match_table = of_match_ptr(dwmac_generic_match),
    },
};

/* Under the hood this registers/unregisters dwmac_generic_driver with
 * platform_driver_register()/platform_driver_unregister(). */
module_platform_driver(dwmac_generic_driver);
```
With the details given, the figure below sums up the whole flow.
Figure:
Flow:
[1] After a successful match, the following is called:
```c
int dwmac_generic_probe(struct platform_device *pdev)
```
[2] Call stmmac_get_platform_resources
```c
ret = stmmac_get_platform_resources(pdev, &stmmac_res);

int stmmac_get_platform_resources(struct platform_device *pdev,
                                  struct stmmac_resources *stmmac_res);
```
This initializes the IRQ information and maps the physical address from the dts into a virtual address for the CPU to use:

```c
stmmac_res->addr = devm_ioremap_resource(&pdev->dev, res);
```
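The body of stmmac_get_platform_resources() is short; paraphrased from the 4.19 source, it does roughly this (the IRQ names are the ones the binding uses):

```c
/* Paraphrased sketch of stmmac_get_platform_resources() in 4.19 */
memset(stmmac_res, 0, sizeof(*stmmac_res));

stmmac_res->irq = platform_get_irq_byname(pdev, "macirq");           /* main IRQ */
stmmac_res->wol_irq = platform_get_irq_byname(pdev, "eth_wake_irq"); /* optional */
stmmac_res->lpi_irq = platform_get_irq_byname(pdev, "eth_lpi");      /* optional */

res = platform_get_resource(pdev, IORESOURCE_MEM, 0);                /* the "reg" property */
stmmac_res->addr = devm_ioremap_resource(&pdev->dev, res);           /* phys -> virt mapping */
```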
[3] Call stmmac_probe_config_dt, which takes the key/value configuration from the dts one step further and stores it in plat_stmmacenet_data.
```c
plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);

struct plat_stmmacenet_data *
stmmac_probe_config_dt(struct platform_device *pdev, const char **mac);
```
It also initializes dma_cfg, i.e. the DMA engine parameters:
```c
struct stmmac_dma_cfg *dma_cfg;
```
If the node is a fixed-link, of_phy_register_fixed_link registers it directly as a virtual PHY device.
If it is not, only the data structure is allocated; nothing is bound and no MDIO bus is requested yet.
```c
rc = stmmac_dt_phy(plat, np, &pdev->dev);

static int stmmac_dt_phy(struct plat_stmmacenet_data *plat,
                         struct device_node *np, struct device *dev)
{
    bool mdio = true;
    static const struct of_device_id need_mdio_ids[] = {
        { .compatible = "snps,dwc-qos-ethernet-4.10" },
        {},
    };

    /* If phy-handle property is passed from DT, use it as the PHY */
    plat->phy_node = of_parse_phandle(np, "phy-handle", 0);
    if (plat->phy_node)
        dev_dbg(dev, "Found phy-handle subnode\n");

    /* If phy-handle is not specified, check if we have a fixed-phy */
    /* For a fixed-link there is no need to request an mdio_bus */
    if (!plat->phy_node && of_phy_is_fixed_link(np)) {
        if ((of_phy_register_fixed_link(np) < 0))
            return -ENODEV;

        dev_dbg(dev, "Found fixed-link subnode\n");
        plat->phy_node = of_node_get(np);
        mdio = false;
    }

    if (of_match_node(need_mdio_ids, np)) {
        plat->mdio_node = of_get_child_by_name(np, "mdio");
    } else {
        /*
         * If snps,dwmac-mdio is passed from DT, always register
         * the MDIO
         */
        for_each_child_of_node(np, plat->mdio_node) {
            if (of_device_is_compatible(plat->mdio_node,
                                        "snps,dwmac-mdio"))
                break;
        }
    }

    if (plat->mdio_node) {
        dev_dbg(dev, "Found MDIO subnode\n");
        mdio = true;
    }

    /* if it is not a fixed-link, allocate an mdio_bus_data */
    if (mdio)
        plat->mdio_bus_data =
            devm_kzalloc(dev, sizeof(struct stmmac_mdio_bus_data),
                         GFP_KERNEL);

    return 0;
}
```
[4] With the basic configuration done, the generic initialization entry point is called with the information gathered above:
```c
ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);

int stmmac_dvr_probe(struct device *device,
                     struct plat_stmmacenet_data *plat_dat,
                     struct stmmac_resources *res);
```
[4.1] Call alloc_etherdev_mqs to allocate the net_device, setting the RX and TX queue counts to 8 (MTL_MAX_TX_QUEUES / MTL_MAX_RX_QUEUES):
```c
ndev = alloc_etherdev_mqs(sizeof(struct stmmac_priv),
                          MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES);

struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
                                      unsigned int rxqs);
```
[4.2] Bind the freshly allocated net_device to the platform_device from above:
```c
SET_NETDEV_DEV(ndev, device);
priv = netdev_priv(ndev);
priv->device = device;
priv->dev = ndev;
```
[4.3] Set the NIC's ethtool operations:
```c
stmmac_set_ethtool_ops(ndev);

void stmmac_set_ethtool_ops(struct net_device *netdev)
{
    netdev->ethtool_ops = &stmmac_ethtool_ops;
}

/* the operation set, for reference */
static const struct ethtool_ops stmmac_ethtool_ops = {
    .begin = stmmac_check_if_running,
    .get_drvinfo = stmmac_ethtool_getdrvinfo,
    .get_msglevel = stmmac_ethtool_getmsglevel,
    .set_msglevel = stmmac_ethtool_setmsglevel,
    .get_regs = stmmac_ethtool_gregs,
    .get_regs_len = stmmac_ethtool_get_regs_len,
    .get_link = ethtool_op_get_link,
    .nway_reset = phy_ethtool_nway_reset,
    .get_pauseparam = stmmac_get_pauseparam,
    .set_pauseparam = stmmac_set_pauseparam,
    .get_ethtool_stats = stmmac_get_ethtool_stats,
    .get_strings = stmmac_get_strings,
    .get_wol = stmmac_get_wol,
    .set_wol = stmmac_set_wol,
    .get_eee = stmmac_ethtool_op_get_eee,
    .set_eee = stmmac_ethtool_op_set_eee,
    .get_sset_count = stmmac_get_sset_count,
    .get_ts_info = stmmac_get_ts_info,
    .get_coalesce = stmmac_get_coalesce,
    .set_coalesce = stmmac_set_coalesce,
    .get_tunable = stmmac_get_tunable,
    .set_tunable = stmmac_set_tunable,
    .get_link_ksettings = stmmac_ethtool_get_link_ksettings,
    .set_link_ksettings = stmmac_ethtool_set_link_ksettings,
};
```
[4.4] Allocate a workqueue and set its work item to stmmac_service_task, which only performs the NIC-reset service; why it is limited to that I do not know.
```c
priv->wq = create_singlethread_workqueue("stmmac_wq");
INIT_WORK(&priv->service_task, stmmac_service_task);
```
[4.5] Set up the hardware parameters:
```c
ret = stmmac_hw_init(priv);

static int stmmac_hw_init(struct stmmac_priv *priv);
```
[4.5.1] Set up the operation sets in more detail; different hardware gets different sets:
```c
ret = stmmac_hwif_init(priv);

int stmmac_hwif_init(struct stmmac_priv *priv)
{
    /* ... declarations and needs_* flags elided ... */

    /* This is a generic driver, so the attributes depend on the hardware */
    for (i = ARRAY_SIZE(stmmac_hw) - 1; i >= 0; i--) {
        /* stmmac_hw is a global array storing each hardware's
         * special operation sets */
        entry = &stmmac_hw[i];

        /* figure out which kind of NIC this is */
        if (needs_gmac ^ entry->gmac)
            continue;
        if (needs_gmac4 ^ entry->gmac4)
            continue;
        if (needs_xgmac ^ entry->xgmac)
            continue;
        /* Use synopsys_id var because some setups can override this */
        if (priv->synopsys_id < entry->min_id)
            continue;

        /* Only use generic HW helpers if needed */
        mac->desc = mac->desc ? : entry->desc;
        mac->dma = mac->dma ? : entry->dma;
        mac->mac = mac->mac ? : entry->mac;
        mac->ptp = mac->ptp ? : entry->hwtimestamp;
        mac->mode = mac->mode ? : entry->mode;
        mac->tc = mac->tc ? : entry->tc;
        /* ... various other assignments omitted ... */
        return 0;
    }
}

/* One example entry is shown here; see the code for the rest */
stmmac_hw[] = {
    /* NOTE: New HW versions shall go to the end of this table */
    {
        .gmac = false,
        .gmac4 = false,
        .xgmac = false,
        .min_id = 0,
        .regs = {
            .ptp_off = PTP_GMAC3_X_OFFSET,
            .mmc_off = MMC_GMAC3_X_OFFSET,
        },
        .desc = NULL,
        .dma = &dwmac100_dma_ops,
        .mac = &dwmac100_ops,
        .hwtimestamp = &stmmac_ptp,
        .mode = NULL,
        .tc = NULL,
        .setup = dwmac100_setup,
        .quirks = stmmac_dwmac1_quirks,
    },
    /* many more follow */
};
```
[4.6] Set the net_device operations. This is important: it is the entry point through which the NIC is brought up.
```c
ndev->netdev_ops = &stmmac_netdev_ops;

static const struct net_device_ops stmmac_netdev_ops = {
    .ndo_open = stmmac_open,
    .ndo_start_xmit = stmmac_xmit,
    .ndo_stop = stmmac_release,
    .ndo_change_mtu = stmmac_change_mtu,
    .ndo_fix_features = stmmac_fix_features,
    .ndo_set_features = stmmac_set_features,
    .ndo_set_rx_mode = stmmac_set_rx_mode,
    .ndo_tx_timeout = stmmac_tx_timeout,
    .ndo_do_ioctl = stmmac_ioctl,
    .ndo_setup_tc = stmmac_setup_tc,
    .ndo_select_queue = stmmac_select_queue,
#ifdef CONFIG_NET_POLL_CONTROLLER
    .ndo_poll_controller = stmmac_poll_controller,
#endif
    .ndo_set_mac_address = stmmac_set_mac_address,
};
```
[4.7] Initialize the traffic-control (tc) part:
```c
ret = stmmac_tc_init(priv, priv);
/* this "function" is actually a macro, a special construct discussed later */
```
[4.8] Set the log level:
```c
priv->msg_enable = netif_msg_init(debug, default_msg_level);

/*
 * The default is:
 * static const u32 default_msg_level = (NETIF_MSG_DRV | NETIF_MSG_PROBE |
 *                                       NETIF_MSG_LINK | NETIF_MSG_IFUP |
 *                                       NETIF_MSG_IFDOWN | NETIF_MSG_TIMER);
 */
static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits);
```
[4.9] Initialize the 8 channels and bind a NAPI instance to each:
```c
maxq = max(priv->plat->rx_queues_to_use, priv->plat->tx_queues_to_use);

for (queue = 0; queue < maxq; queue++) {
    struct stmmac_channel *ch = &priv->channel[queue];

    ch->priv_data = priv;
    ch->index = queue;

    if (queue < priv->plat->rx_queues_to_use)
        ch->has_rx = true;
    if (queue < priv->plat->tx_queues_to_use)
        ch->has_tx = true;

    /* so there are 8 NAPI instances in total */
    netif_napi_add(ndev, &ch->napi, stmmac_napi_poll, NAPI_POLL_WEIGHT);
}
```
[4.10] Register the PHY:
```c
if (priv->hw->pcs != STMMAC_PCS_RGMII &&
    priv->hw->pcs != STMMAC_PCS_TBI &&
    priv->hw->pcs != STMMAC_PCS_RTBI) {
    /* MDIO bus Registration */
    ret = stmmac_mdio_register(ndev);
}

int stmmac_mdio_register(struct net_device *ndev)
{
    int err = 0;
    struct mii_bus *new_bus;
    struct stmmac_priv *priv = netdev_priv(ndev);
    struct stmmac_mdio_bus_data *mdio_bus_data = priv->plat->mdio_bus_data;
    struct device_node *mdio_node = priv->plat->mdio_node;
    struct device *dev = ndev->dev.parent;
    int addr, found, max_addr;

    /* If stmmac_probe_config_dt saw a fixed-link earlier, this should be
     * NULL; otherwise there is a PHY device that needs registering */
    if (!mdio_bus_data)
        return 0;

    /* allocate a new MII bus */
    new_bus = mdiobus_alloc();
    if (!new_bus)
        return -ENOMEM;

    /* set the PHY device IRQs */
    if (mdio_bus_data->irqs)
        memcpy(new_bus->irq, mdio_bus_data->irqs, sizeof(new_bus->irq));

#ifdef CONFIG_OF
    /* this #ifdef effectively indicates whether a device tree is in use */
    if (priv->device->of_node)
        mdio_bus_data->reset_gpio = -1;
#endif

    new_bus->name = "stmmac";

    if (priv->plat->has_xgmac) {
        /* set the MDIO operation set */
        new_bus->read = &stmmac_xgmac2_mdio_read;
        new_bus->write = &stmmac_xgmac2_mdio_write;

        /* Right now only C22 phys are supported */
        max_addr = MII_XGMAC_MAX_C22ADDR + 1;

        /* Check if DT specified an unsupported phy addr */
        if (priv->plat->phy_addr > MII_XGMAC_MAX_C22ADDR)
            dev_err(dev, "Unsupported phy_addr (max=%d)\n",
                    MII_XGMAC_MAX_C22ADDR);
    } else {
        new_bus->read = &stmmac_mdio_read;
        new_bus->write = &stmmac_mdio_write;
        max_addr = PHY_MAX_ADDR;
    }

    new_bus->reset = &stmmac_mdio_reset;
    snprintf(new_bus->id, MII_BUS_ID_SIZE, "%s-%x",
             new_bus->name, priv->plat->bus_id);
    new_bus->priv = ndev;
    new_bus->phy_mask = mdio_bus_data->phy_mask;
    new_bus->parent = priv->device;

    /* Register the MII bus; this should scan mdio_node for all PHY
     * devices and register them on the bus */
    err = of_mdiobus_register(new_bus, mdio_node);
    if (err != 0) {
        dev_err(dev, "Cannot register the MDIO bus\n");
        goto bus_register_fail;
    }

    if (priv->plat->phy_node || mdio_node)
        goto bus_register_done;

    found = 0;
    /* Walk the PHY address space looking for PHY devices.
     * (Not entirely clear to me yet; revisited later.) */
    for (addr = 0; addr < max_addr; addr++) {
        struct phy_device *phydev = mdiobus_get_phy(new_bus, addr);

        if (!phydev)
            continue;

        /*
         * If an IRQ was provided to be assigned after
         * the bus probe, do it here.
         */
        if (!mdio_bus_data->irqs &&
            (mdio_bus_data->probed_phy_irq > 0)) {
            new_bus->irq[addr] = mdio_bus_data->probed_phy_irq;
            phydev->irq = mdio_bus_data->probed_phy_irq;
        }

        /*
         * If we're going to bind the MAC to this PHY bus,
         * and no PHY number was provided to the MAC,
         * use the one probed here.
         */
        if (priv->plat->phy_addr == -1)
            priv->plat->phy_addr = addr;

        phy_attached_info(phydev);
        found = 1;
    }
    /* ... */
}
```
[4.11] Register the net_device:
```c
ret = register_netdev(ndev);

int register_netdev(struct net_device *dev);
```
(4) Summary
Looking back over the initialization, the main work was filling in the stmmac_priv structure, initializing the net_device structure, and assigning the dma and mac operation sets.
The PHY-driver side is omitted for now and will be covered later.
[1] Special code
One family of functions is written in an unusual but very practical way, worth borrowing.
For example, these:
```c
#define stmmac_mode_init(__priv, __args...) \
    stmmac_do_void_callback(__priv, mode, init, __args)
#define stmmac_is_jumbo_frm(__priv, __args...) \
    stmmac_do_callback(__priv, mode, is_jumbo_frm, __args)
#define stmmac_jumbo_frm(__priv, __args...) \
    stmmac_do_callback(__priv, mode, jumbo_frm, __args)
#define stmmac_set_16kib_bfsize(__priv, __args...) \
    stmmac_do_callback(__priv, mode, set_16kib_bfsize, __args)
#define stmmac_init_desc3(__priv, __args...) \
    stmmac_do_void_callback(__priv, mode, init_desc3, __args)

#define stmmac_do_void_callback(__priv, __module, __cname, __arg0, __args...) \
({ \
    int __result = -EINVAL; \
    if ((__priv)->hw->__module && (__priv)->hw->__module->__cname) { \
        (__priv)->hw->__module->__cname((__arg0), ##__args); \
        __result = 0; \
    } \
    __result; \
})
```

As you can see, this completely isolates the differences between the operation functions of different hardware, with no need for dispatch arrays or enums, which keeps the code very concise. The price is genuinely poor readability.
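A toy, compilable re-creation of the pattern (all toy_* names are invented here) makes the dispatch explicit:

```c
#include <stdio.h>

/* Toy re-creation of the stmmac callback-macro pattern; not driver code. */
struct toy_mode_ops {
    void (*init)(void *desc);
};

struct toy_hw {
    const struct toy_mode_ops *mode;
};

struct toy_priv {
    struct toy_hw *hw;
};

/* Call hw->mode->init if both exist; otherwise return -22 (-EINVAL). */
#define toy_do_void_callback(__priv, __module, __cname, __args...)       \
({                                                                       \
    int __result = -22;                                                  \
    if ((__priv)->hw->__module && (__priv)->hw->__module->__cname) {     \
        (__priv)->hw->__module->__cname(__args);                         \
        __result = 0;                                                    \
    }                                                                    \
    __result;                                                            \
})

#define toy_mode_init(__priv, __args...) \
    toy_do_void_callback(__priv, mode, init, __args)

static void gmac4_style_init(void *desc)
{
    printf("mode->init called on %p\n", desc);
}

int main(void)
{
    static const struct toy_mode_ops ops = { .init = gmac4_style_init };
    struct toy_hw hw = { .mode = &ops };
    struct toy_priv priv = { .hw = &hw };
    int dummy;

    /* dispatches through the operation set; returns 0 on success */
    return toy_mode_init(&priv, &dummy);
}
```

The ({ ... }) statement expression is a GNU C extension, which the kernel relies on throughout.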
III. The Main Part
With the overall initialization flow covered, we now focus on the DMA initialization proper and on how the DMA is driven.
As mentioned in [1] Special code, essentially all hardware-specific operations are isolated inside the stmmac_hwif_entry array.
Trying to read all of that code is clearly unrealistic, and exhausting; picking one entry and grinding through it is the better approach. Here is the one we will use:
```c
static const struct stmmac_hwif_entry {
    bool gmac;
    bool gmac4;
    bool xgmac;
    u32 min_id;
    const struct stmmac_regs_off regs;
    const void *desc;        /* operates on the hardware DMA descriptors */
    const void *dma;         /* operates on the DMA engine */
    const void *mac;         /* operates on the MAC */
    const void *hwtimestamp;
    const void *mode;        /* in dwmac410 the mode ops are all empty;
                              * only the early dwmac10/100/1000 have
                              * chain_mode, later cores do not */
    const void *tc;          /* traffic-control functions; ignore for now */
    int (*setup)(struct stmmac_priv *priv);
    int (*quirks)(struct stmmac_priv *priv);
} stmmac_hw[] = {
    {
        .gmac = false,
        .gmac4 = true,
        .xgmac = false,
        .min_id = DWMAC_CORE_4_10,
        .regs = {
            .ptp_off = PTP_GMAC4_OFFSET,
            .mmc_off = MMC_GMAC4_OFFSET,
        },
        .desc = &dwmac4_desc_ops,
        .dma = &dwmac410_dma_ops,
        .mac = &dwmac410_ops,
        .hwtimestamp = &stmmac_ptp,
        .mode = &dwmac4_ring_mode_ops,
        .tc = NULL,
        .setup = dwmac4_setup,
        .quirks = NULL,
    },
    /* ... */
};
```
1. Initialization
(1) Initialization flow
For the initialization, this entry is all you need to look at. Focus on the DMA initialization; the MAC initialization mostly just writes the MAC registers according to the device tree information.
(2) stmmac_open
```c
static int stmmac_open(struct net_device *dev)
{
    struct stmmac_priv *priv = netdev_priv(dev);
    u32 chan;
    int ret;

    /* TBI and RTBI are, like MII, ways for the PHY and MAC to talk */
    if (priv->hw->pcs != STMMAC_PCS_RGMII &&
        priv->hw->pcs != STMMAC_PCS_TBI &&
        priv->hw->pcs != STMMAC_PCS_RTBI) {
        /* mainly link-speed negotiation */
        ret = stmmac_init_phy(dev);
        if (ret) {
            netdev_err(priv->dev,
                       "%s: Cannot attach to PHY (error: %d)\n",
                       __func__, ret);
            return ret;
        }
    }

    /* Extra statistics */
    memset(&priv->xstats, 0, sizeof(struct stmmac_extra_stats));
    priv->xstats.threshold = tc;

    /* this is the buffer size, 1536 by default */
    priv->dma_buf_sz = STMMAC_ALIGN(buf_sz);
    priv->rx_copybreak = STMMAC_RX_COPYBREAK;

    /* Allocate the hardware DMA descriptors. The queues were already set
     * up during initialization: 8 queues, 512 DMA descriptors each, and
     * the descriptors are written to the DMA engine's I/O address.
     * Also 512 * 8 * 2 (sk_buff *) pointers, for RX and TX respectively. */
    ret = alloc_dma_desc_resources(priv);
    if (ret < 0) {
        netdev_err(priv->dev,
                   "%s: DMA descriptors allocation failed\n", __func__);
        goto dma_desc_error;
    }

    /* An important difference here: for the RX queues, 512 * 8 sk_buffs
     * are allocated right now and mapped into the DMA hardware
     * descriptors. For the TX queues nothing of the sort happens, only
     * some flag initialization. */
    ret = init_dma_desc_rings(dev, GFP_KERNEL);
    if (ret < 0) {
        netdev_err(priv->dev,
                   "%s: DMA descriptors initialization failed\n",
                   __func__);
        goto init_error;
    }

    /* initialize the hardware side, mainly the MAC */
    ret = stmmac_hw_setup(dev, true);
    if (ret < 0) {
        netdev_err(priv->dev, "%s: Hw setup failed\n", __func__);
        goto init_error;
    }

    stmmac_init_tx_coalesce(priv);

    if (dev->phydev)
        phy_start(dev->phydev);

    /* Request the IRQ lines */
    /* The IRQ number was assigned while parsing the device tree; here
     * the interrupt line is requested and bound to stmmac_interrupt */
    ret = request_irq(dev->irq, stmmac_interrupt, IRQF_SHARED,
                      dev->name, dev);
    if (unlikely(ret < 0)) {
        netdev_err(priv->dev,
                   "%s: ERROR: allocating the IRQ %d (error: %d)\n",
                   __func__, dev->irq, ret);
        goto irq_error;
    }

    /* ... */

    /* napi_enable the napi_structs of the 8 channels initialized at
     * the very beginning */
    stmmac_enable_all_queues(priv);

    /* netif_tx_start_queue brings the net_device tx queues live */
    stmmac_start_all_queues(priv);

    return 0;

    /* ... */
}
```
Because the resource handling differs between the TX and RX queues, the two are covered separately in detail:
2. Transmit flow
(1) Resource allocation
[1] alloc_dma_desc_resources -> alloc_dma_tx_desc_resources
```c
static int alloc_dma_tx_desc_resources(struct stmmac_priv *priv)
{
    u32 tx_count = priv->plat->tx_queues_to_use;
    int ret = -ENOMEM;
    u32 queue;

    /* TX queues buffers and DMA */
    for (queue = 0; queue < tx_count; queue++) {
        struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue];

        tx_q->queue_index = queue;
        tx_q->priv_data = priv;

        /* What we get here is a virtual address.
         * Allocate the DMA bookkeeping directly: 512 entries */
        tx_q->tx_skbuff_dma = kmalloc_array(DMA_TX_SIZE,
                                            sizeof(*tx_q->tx_skbuff_dma),
                                            GFP_KERNEL);
        if (!tx_q->tx_skbuff_dma)
            goto err_dma;

        /* note: what is allocated here are pointers! */
        tx_q->tx_skbuff = kmalloc_array(DMA_TX_SIZE,
                                        sizeof(struct sk_buff *),
                                        GFP_KERNEL);
        if (!tx_q->tx_skbuff)
            goto err_dma;

        if (priv->extend_desc) {
            tx_q->dma_etx = dma_zalloc_coherent(priv->device,
                    DMA_TX_SIZE * sizeof(struct dma_extended_desc),
                    &tx_q->dma_tx_phy, GFP_KERNEL);
            if (!tx_q->dma_etx)
                goto err_dma;
        } else {
            /* Returns a virtual address and stores the I/O address in
             * tx_q->dma_tx_phy. The DMA device operates on I/O addresses
             * (essentially physical addresses behind the IOMMU), and
             * coherent memory keeps physical memory and caches
             * consistent.
             * Why coherent here? The DMA descriptors are memory that the
             * CPU and the device both touch frequently, with a long
             * lifetime; flushing the cache all the time would hurt
             * performance. Hence: allocate once, up front. */
            tx_q->dma_tx = dma_zalloc_coherent(priv->device,
                    DMA_TX_SIZE * sizeof(struct dma_desc),
                    &tx_q->dma_tx_phy, GFP_KERNEL);
            if (!tx_q->dma_tx)
                goto err_dma;
        }
    }

    return 0;

    /* ... */
}
```
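The contrast the comments draw between the two mapping styles can be put in one sketch. This is not driver code; it assumes a valid struct device and skb, and uses the 4.19-era API where dma_zalloc_coherent() still exists:

```c
#include <linux/dma-mapping.h>
#include <linux/skbuff.h>

/* Minimal contrast of coherent vs. streaming DMA mappings (sketch only). */
static int dma_mapping_contrast(struct device *dev, struct sk_buff *skb)
{
    dma_addr_t ring_phys, buf_phys;
    void *ring;

    /* Coherent: long-lived memory that CPU and device both touch often,
     * e.g. descriptor rings; no per-access cache maintenance needed. */
    ring = dma_zalloc_coherent(dev, 512 * sizeof(u64), &ring_phys,
                               GFP_KERNEL);
    if (!ring)
        return -ENOMEM;

    /* Streaming: a one-shot per-packet buffer; the cache is synced at
     * map/unmap (or dma_sync_*) time. */
    buf_phys = dma_map_single(dev, skb->data, skb_headlen(skb),
                              DMA_TO_DEVICE);
    if (dma_mapping_error(dev, buf_phys)) {
        dma_free_coherent(dev, 512 * sizeof(u64), ring, ring_phys);
        return -ENOMEM;
    }

    /* ... the device would perform its transfer here ... */

    dma_unmap_single(dev, buf_phys, skb_headlen(skb), DMA_TO_DEVICE);
    dma_free_coherent(dev, 512 * sizeof(u64), ring, ring_phys);
    return 0;
}
```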
[2] init_dma_desc_rings -> init_dma_tx_desc_rings
```c
static int init_dma_tx_desc_rings(struct net_device *dev)
{
    struct stmmac_priv *priv = netdev_priv(dev);
    u32 tx_queue_cnt = priv->plat->tx_queues_to_use;
    u32 queue;
    int i;

    for (queue = 0; queue < tx_queue_cnt; queue++) {
        struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue];

        netif_dbg(priv, probe, priv->dev,
                  "(%s) dma_tx_phy=0x%08x\n", __func__,
                  (u32)tx_q->dma_tx_phy);

        /* Setup the chained descriptor addresses */
        /* only CHAIN_MODE needs this; ignore it */
        if (priv->mode == STMMAC_CHAIN_MODE) {
            if (priv->extend_desc)
                stmmac_mode_init(priv, tx_q->dma_etx,
                                 tx_q->dma_tx_phy, DMA_TX_SIZE, 1);
            else
                stmmac_mode_init(priv, tx_q->dma_tx,
                                 tx_q->dma_tx_phy, DMA_TX_SIZE, 0);
        }

        /* Very different from the RX queues, which at this point map
         * all their (sk_buff *) buffers into the dma_desc entries */
        for (i = 0; i < DMA_TX_SIZE; i++) {
            struct dma_desc *p;

            if (priv->extend_desc)
                p = &((tx_q->dma_etx + i)->basic);
            else
                p = tx_q->dma_tx + i;

            stmmac_clear_desc(priv, p);

            tx_q->tx_skbuff_dma[i].buf = 0;
            tx_q->tx_skbuff_dma[i].map_as_page = false;
            tx_q->tx_skbuff_dma[i].len = 0;
            tx_q->tx_skbuff_dma[i].last_segment = false;
            tx_q->tx_skbuff[i] = NULL;
        }

        tx_q->dirty_tx = 0;
        tx_q->cur_tx = 0;
        tx_q->mss = 0;

        netdev_tx_reset_queue(netdev_get_tx_queue(priv->dev, queue));
    }

    return 0;
}
```
[3] Data structures
With the basic initialization finished, let's see what everything looks like afterwards.
(2) stmmac_xmit
Note that up to this function no interrupt of any kind has been involved, not even a softirq; we are still in ordinary kernel context.
Note the following macro; I recommend reading this against the picture in [3] Data structures:
```c
#define STMMAC_GET_ENTRY(x, size)    ((x + 1) & (size - 1))
```
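This works as a ring-index advance only because the ring sizes (DMA_TX_SIZE / DMA_RX_SIZE = 512) are powers of two, so the AND is equivalent to a modulo. A tiny standalone demo of the wraparound:

```c
#include <stdio.h>

#define STMMAC_GET_ENTRY(x, size)    ((x + 1) & (size - 1))

int main(void)
{
    /* With size = 512 (a power of two) the index wraps: 510, 511, 0, 1 */
    unsigned int entry = 510;
    int i;

    for (i = 0; i < 4; i++) {
        printf("%u\n", entry);
        entry = STMMAC_GET_ENTRY(entry, 512);
    }
    return 0;
}
```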
```c
static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
{
    struct stmmac_priv *priv = netdev_priv(dev);
    /* skb->len is the linear length plus the total fragment length;
     * skb->data_len is the total fragment length alone */
    unsigned int nopaged_len = skb_headlen(skb);
    int i, csum_insertion = 0, is_jumbo = 0;
    u32 queue = skb_get_queue_mapping(skb);
    int nfrags = skb_shinfo(skb)->nr_frags;
    int entry;
    unsigned int first_entry;
    struct dma_desc *desc, *first;
    struct stmmac_tx_queue *tx_q;
    unsigned int enh_desc;
    unsigned int des;

    tx_q = &priv->tx_queue[queue];

    /* not the focus here */
    if (priv->tx_path_in_lpi_mode)
        stmmac_disable_eee_mode(priv);

    /* Manage oversized TCP frames for GMAC4 device */
    if (skb_is_gso(skb) && priv->tso) {
        if (skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))
            return stmmac_tso_xmit(skb, dev);
    }

    /* are there still enough free hardware DMA descriptors? */
    if (unlikely(stmmac_tx_avail(priv, queue) < nfrags + 1)) {
        if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, queue))) {
            netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, queue));
            /* This is a hard error, log it. */
            netdev_err(priv->dev,
                       "%s: Tx Ring full when queue awake\n", __func__);
        }
        return NETDEV_TX_BUSY;
    }

    /* note: this points at the first free hardware descriptor */
    entry = tx_q->cur_tx;
    /* the first one is not used yet; it is kept for storing the
     * sk_buff later */
    first_entry = entry;
    WARN_ON(tx_q->tx_skbuff[first_entry]);

    csum_insertion = (skb->ip_summed == CHECKSUM_PARTIAL);

    if (likely(priv->extend_desc))
        desc = (struct dma_desc *)(tx_q->dma_etx + entry);
    else
        desc = tx_q->dma_tx + entry;

    first = desc;

    /* not the focus here */
    enh_desc = priv->plat->enh_desc;
    /* To program the descriptors according to the size of the frame */
    if (enh_desc)
        is_jumbo = stmmac_is_jumbo_frm(priv, skb->len, enh_desc);

    if (unlikely(is_jumbo)) {
        entry = stmmac_jumbo_frm(priv, tx_q, skb, csum_insertion);
        if (unlikely(entry < 0) && (entry != -EINVAL))
            goto dma_map_err;
    }

    /* When the sk_buff's linear area is not enough, the frame is split
     * across frags, each mapped individually, page by page */
    for (i = 0; i < nfrags; i++) {
        const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
        /* fragment length */
        int len = skb_frag_size(frag);
        bool last_segment = (i == (nfrags - 1));

        /* Compare with the macro above: this really is a traversal.
         * NOTE: the first free slot is NOT used here, it is skipped! */
        entry = STMMAC_GET_ENTRY(entry, DMA_TX_SIZE);
        WARN_ON(tx_q->tx_skbuff[entry]);

        if (likely(priv->extend_desc))
            desc = (struct dma_desc *)(tx_q->dma_etx + entry);
        else
            desc = tx_q->dma_tx + entry;

        /* Map the skb_frag data for DMA; the return value is an I/O
         * address. This interface takes a frag rather than a buffer,
         * but the result is equivalent: mapped per page. */
        des = skb_frag_dma_map(priv->device, frag, 0, len,
                               DMA_TO_DEVICE);
        if (dma_mapping_error(priv->device, des))
            goto dma_map_err; /* should reuse desc w/o issues */

        tx_q->tx_skbuff_dma[entry].buf = des;

        /* put this frag's I/O address into desc0 of the hardware
         * descriptor */
        stmmac_set_desc_addr(priv, desc, des);

        /* i.e. mapped as a page */
        tx_q->tx_skbuff_dma[entry].map_as_page = true;
        tx_q->tx_skbuff_dma[entry].len = len;
        /* marks the last of the frags */
        tx_q->tx_skbuff_dma[entry].last_segment = last_segment;

        /* Prepare the descriptor and set the own bit too */
        /* the len here is the size of each frag */
        stmmac_prepare_tx_desc(priv, desc, 0, len, csum_insertion,
                               priv->mode, 1, last_segment, skb->len);
    }

    /* Only the last descriptor gets to point to the skb. */
    tx_q->tx_skbuff[entry] = skb;

    entry = STMMAC_GET_ENTRY(entry, DMA_TX_SIZE);
    tx_q->cur_tx = entry;

    /* ... */

    /* MAX_SKB_FRAGS is usually 16, so no fear of exceeding 512 */
    if (unlikely(stmmac_tx_avail(priv, queue) <= (MAX_SKB_FRAGS + 1))) {
        netif_dbg(priv, hw, priv->dev,
                  "%s: stop transmitted packets\n", __func__);
        netif_tx_stop_queue(netdev_get_tx_queue(priv->dev, queue));
    }

    dev->stats.tx_bytes += skb->len;
    tx_q->tx_count_frames += nfrags + 1;

    /* timestamping; not the focus here */
    if (likely(priv->tx_coal_frames > tx_q->tx_count_frames) &&
        !(priv->synopsys_id >= DWMAC_CORE_4_00 &&
          (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
          priv->hwts_tx_en)) {
        stmmac_tx_timer_arm(priv, queue);
    } else {
        tx_q->tx_count_frames = 0;
        stmmac_set_tx_ic(priv, desc);
        priv->xstats.tx_set_ic_bit++;
    }

    skb_tx_timestamp(skb);

    /* Ready to fill the first descriptor and set the OWN bit w/o any
     * problems because all the descriptors are actually ready to be
     * passed to the DMA engine.
     */
    if (likely(!is_jumbo)) {
        /* if it is not a jumbo frame, this sk_buff is the last segment */
        bool last_segment = (nfrags == 0);

        des = dma_map_single(priv->device, skb->data,
                             nopaged_len, DMA_TO_DEVICE);
        if (dma_mapping_error(priv->device, des))
            goto dma_map_err;

        tx_q->tx_skbuff_dma[first_entry].buf = des;

        stmmac_set_desc_addr(priv, first, des);

        tx_q->tx_skbuff_dma[first_entry].len = nopaged_len;
        tx_q->tx_skbuff_dma[first_entry].last_segment = last_segment;

        if (unlikely((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
                     priv->hwts_tx_en)) {
            /* declare that device is doing timestamping */
            skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
            stmmac_enable_tx_timestamp(priv, first);
        }

        /* Prepare the first descriptor setting the OWN bit too */
        stmmac_prepare_tx_desc(priv, first, 1, nopaged_len,
                               csum_insertion, priv->mode, 1,
                               last_segment, skb->len);
    } else {
        stmmac_set_tx_owner(priv, first);
    }

    /* The own bit must be the latest setting done when prepare the
     * descriptor and then barrier is needed to make sure that
     * all is coherent before granting the DMA engine.
     */
    wmb();

    netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len);

    /* dwmac4 does not have this function */
    stmmac_enable_dma_transmission(priv, priv->ioaddr);

    /* The hardware-descriptor address of the last packet is not placed
     * in a DMA descriptor; it is written to the tail pointer directly */
    tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * sizeof(*desc));
    stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue);

    /* Do not forget: once the DMA transfer completes, another hard
     * interrupt fires; that is when the ring state is updated */
    return NETDEV_TX_OK;

    /* error labels (dma_map_err, ...) elided */
}
```
[1] The kernel's TX queues and the DMA TX queues
The initialization walkthrough mentioned that allocating the net_device also creates 8 RX and 8 TX queues, while the DMA likewise uses 8 queues in each direction. How are the two tied together?
This is how the NIC's TX queues and the kernel's TX queues end up neatly associated; the usual pattern is:
at xmit time, the skb yields the net_device TX queue number, and that same number is then used to drive the NIC's TX queue.
The driver therefore never has to schedule queues itself; the kernel's stance is:
"I do the scheduling and pick the queue. Even if your queues are not the same as mine, just follow the number and you are done."
```c
u32 queue = skb_get_queue_mapping(skb);

static inline void skb_set_queue_mapping(struct sk_buff *skb,
                                         u16 queue_mapping)
{
    skb->queue_mapping = queue_mapping;
}
```
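A hypothetical driver skeleton (the foo_* names are invented) shows the pattern in isolation:

```c
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Sketch only: the index the kernel stored in the skb directly selects
 * the driver's own hardware ring. */
struct foo_tx_ring { unsigned int head, tail; };
struct foo_priv    { struct foo_tx_ring rings[8]; };

static netdev_tx_t foo_xmit(struct sk_buff *skb, struct net_device *dev)
{
    struct foo_priv *priv = netdev_priv(dev);
    u32 queue = skb_get_queue_mapping(skb);         /* chosen by the core */
    struct foo_tx_ring *ring = &priv->rings[queue]; /* same index, our ring */

    /* ... fill DMA descriptors on "ring" ... */
    (void)ring;
    return NETDEV_TX_OK;
}
```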
With the initial TX queue selection settled, everything then flows through the key function stmmac_xmit() shown above.
[2] Layout of an skbuff
First a piece of background: an skbuff is split into a linear area (directly mapped) and a frag area (mapped per page), like this:
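In code terms (a sketch, not stmmac code), a driver walks the two regions as follows:

```c
#include <linux/skbuff.h>

/* Sketch: how a driver walks both regions of an skb, as stmmac_xmit does. */
static void skb_layout_walk(const struct sk_buff *skb)
{
    int i;

    /* Linear area: starts at skb->data, skb_headlen(skb) bytes, virtually
     * contiguous; this is what dma_map_single() maps. */
    pr_debug("linear area: %u bytes\n", skb_headlen(skb));

    /* Paged (frag) area: each fragment lives in its own page and is
     * mapped per page, e.g. with skb_frag_dma_map(). */
    for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
        pr_debug("frag %d: %u bytes\n", i,
                 skb_frag_size(&skb_shinfo(skb)->frags[i]));
}
```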
[3] Mapping skbuffs to the DMA
As you can see, there is a mapping relationship here; the sk_buff is never copied.
A few details are not drawn in the picture:
① The sk_buff ring stores sk_buff pointers, while the address mapped for DMA is sk_buff->data.
② dma_map_single maps linear addresses, and sk_buff->data is mapped that way; dma_map_page maps page by page, and the frags[i] are mapped that way.
③ It is not one sk_buff per tx_skbuff[i]; instead, at the position of the last frags[i], the driver sets tx_skbuff[i] = sk_buff.
After the mapping, the result looks like this:
How exactly the ring is advanced can be seen in the code; it is not repeated here.
(3) Summary
The overall flow is:
- the kernel selects a TX queue
- stmmac_xmit
- the skb's frags[i] and data are mapped into the hardware DMA descriptors
- the ring indices are advanced
- an interrupt is raised and the resources are reclaimed
There is one last step: when the DMA transfer completes, another DMA interrupt fires, landing in the handler bound during initialization, stmmac_interrupt().
That function handles both packet reception and sk_buff cleanup, so it is covered last.
3. Receive flow
(1) Resource allocation
[1] alloc_dma_desc_resources -> alloc_dma_rx_desc_resources
```c
static int alloc_dma_rx_desc_resources(struct stmmac_priv *priv)
{
    /* this is 8 */
    u32 rx_count = priv->plat->rx_queues_to_use;
    int ret = -ENOMEM;
    u32 queue;

    /* RX queues buffers and DMA */
    for (queue = 0; queue < rx_count; queue++) {
        struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];

        rx_q->queue_index = queue;
        rx_q->priv_data = priv;

        /* What we get here is a virtual address.
         * Allocate the DMA bookkeeping directly: 512 dma_addr_t */
        rx_q->rx_skbuff_dma = kmalloc_array(DMA_RX_SIZE,
                                            sizeof(dma_addr_t),
                                            GFP_KERNEL);
        if (!rx_q->rx_skbuff_dma)
            goto err_dma;

        rx_q->rx_skbuff = kmalloc_array(DMA_RX_SIZE,
                                        sizeof(struct sk_buff *),
                                        GFP_KERNEL);
        if (!rx_q->rx_skbuff)
            goto err_dma;

        if (priv->extend_desc) {
            /* Returns a virtual address and stores the I/O address in
             * rx_q->dma_rx_phy; see the TX path for why coherent memory
             * is used for descriptor rings */
            rx_q->dma_erx = dma_zalloc_coherent(priv->device,
                    DMA_RX_SIZE * sizeof(struct dma_extended_desc),
                    &rx_q->dma_rx_phy, GFP_KERNEL);
            if (!rx_q->dma_erx)
                goto err_dma;
        } else {
            rx_q->dma_rx = dma_zalloc_coherent(priv->device,
                    DMA_RX_SIZE * sizeof(struct dma_desc),
                    &rx_q->dma_rx_phy, GFP_KERNEL);
            if (!rx_q->dma_rx)
                goto err_dma;
        }
    }

    return 0;
}
```
[2] init_dma_desc_rings -> init_dma_rx_desc_rings
```c
static int init_dma_rx_desc_rings(struct net_device *dev, gfp_t flags)
{
    struct stmmac_priv *priv = netdev_priv(dev);
    u32 rx_count = priv->plat->rx_queues_to_use;
    int ret = -ENOMEM;
    int bfsize = 0;
    int queue;
    int i;

    /* the final buf_size is decided by the configured MTU */
    bfsize = stmmac_set_16kib_bfsize(priv, dev->mtu);
    if (bfsize < 0)
        bfsize = 0;

    if (bfsize < BUF_SIZE_16KiB)
        bfsize = stmmac_set_bfsize(dev->mtu, priv->dma_buf_sz);

    priv->dma_buf_sz = bfsize;

    for (queue = 0; queue < rx_count; queue++) {
        struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];

        for (i = 0; i < DMA_RX_SIZE; i++) {
            struct dma_desc *p;

            if (priv->extend_desc)
                p = &((rx_q->dma_erx + i)->basic);
            else
                p = rx_q->dma_rx + i;

            /* This is where RX differs from TX: the buffers are
             * pre-allocated right here */
            ret = stmmac_init_rx_buffers(priv, p, i, flags, queue);
            if (ret)
                goto err_init_rx_buffers;
        }

        rx_q->cur_rx = 0;
        rx_q->dirty_rx = (unsigned int)(i - DMA_RX_SIZE);

        stmmac_clear_rx_descriptors(priv, queue);
    }

    buf_sz = bfsize;

    return 0;

    /* ... */
}
```
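For reference, the size ladder in stmmac_set_bfsize() picks the smallest buffer class that holds the MTU. Paraphrased from the 4.19 source (the BUF_SIZE_* and DEFAULT_BUFSIZE constants are the driver's own, from common.h):

```c
/* Paraphrase of 4.19's stmmac_set_bfsize(): smallest class that fits the MTU */
static int bfsize_for_mtu(int mtu, int bufsize)
{
    int ret = bufsize;

    if (mtu >= BUF_SIZE_8KiB)
        ret = BUF_SIZE_16KiB;
    else if (mtu >= BUF_SIZE_4KiB)
        ret = BUF_SIZE_8KiB;
    else if (mtu >= BUF_SIZE_2KiB)
        ret = BUF_SIZE_4KiB;
    else if (mtu > DEFAULT_BUFSIZE)
        ret = BUF_SIZE_2KiB;
    else
        ret = DEFAULT_BUFSIZE;

    return ret;
}
```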
[3] stmmac_init_rx_buffers
The steps above are much the same as on the TX side; this is the heavyweight part:
```c
static int stmmac_init_rx_buffers(struct stmmac_priv *priv,
                                  struct dma_desc *p, int i, gfp_t flags,
                                  u32 queue)
{
    struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
    struct sk_buff *skb;

    /* the skb is allocated right here, and it is long-lived */
    skb = __netdev_alloc_skb_ip_align(priv->dev, priv->dma_buf_sz, flags);
    if (!skb) {
        netdev_err(priv->dev,
                   "%s: Rx init fails; skb is NULL\n", __func__);
        return -ENOMEM;
    }
    rx_q->rx_skbuff[i] = skb;

    /* Streaming mapping: returns the I/O address the device can operate
     * on; the virtual address stays reachable via skb->data */
    rx_q->rx_skbuff_dma[i] = dma_map_single(priv->device, skb->data,
                                            priv->dma_buf_sz,
                                            DMA_FROM_DEVICE);
    if (dma_mapping_error(priv->device, rx_q->rx_skbuff_dma[i])) {
        netdev_err(priv->dev, "%s: DMA mapping error\n", __func__);
        dev_kfree_skb_any(skb);
        return -EINVAL;
    }

    /* converts host byte order to little-endian */
    stmmac_set_desc_addr(priv, p, rx_q->rx_skbuff_dma[i]);

    /* only xgmac uses this */
    if (priv->dma_buf_sz == BUF_SIZE_16KiB)
        stmmac_init_desc3(priv, p);

    return 0;
}
```
[4] Data structures
Compared with the TX side, apart from the absence of the TX control information, the most important difference is this:
the RX queues already allocated 512 * 8 sk_buffs during initialization and put the mapped data addresses into the hardware DMA descriptors.
(2) stmmac_interrupt
```c
static irqreturn_t stmmac_interrupt(int irq, void *dev_id)
{
    struct net_device *dev = (struct net_device *)dev_id;
    struct stmmac_priv *priv = netdev_priv(dev);
    u32 rx_cnt = priv->plat->rx_queues_to_use;
    u32 tx_cnt = priv->plat->tx_queues_to_use;
    u32 queues_count;
    u32 queue;
    bool xmac;

    /* ... */

    /* The preceding chatter is all checks for unexpected conditions,
     * so we go straight to the real interrupt handling */

    /* To handle DMA interrupts */
    stmmac_dma_interrupt(priv);

    return IRQ_HANDLED;
}
```
(3) stmmac_dma_interrupt
```c
static void stmmac_dma_interrupt(struct stmmac_priv *priv)
{
    u32 tx_channel_count = priv->plat->tx_queues_to_use;
    u32 rx_channel_count = priv->plat->rx_queues_to_use;
    u32 channels_to_check = tx_channel_count > rx_channel_count ?
                            tx_channel_count : rx_channel_count;
    u32 chan;
    int status[max_t(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)];

    /* Make sure we never check beyond our status buffer. */
    if (WARN_ON_ONCE(channels_to_check > ARRAY_SIZE(status)))
        channels_to_check = ARRAY_SIZE(status);

    /* This makes the meaning of "channel" clear: there are 8 DMA
     * channels, shared by the 8 RX queues and the 8 TX queues */
    for (chan = 0; chan < channels_to_check; chan++)
        status[chan] = stmmac_napi_check(priv, chan);

    /* The rest is error handling; skip it */
    for (chan = 0; chan < tx_channel_count; chan++) {
        if (unlikely(status[chan] & tx_hard_error_bump_tc)) {
            /* Try to bump up the dma threshold on this failure */
            if (unlikely(priv->xstats.threshold != SF_DMA_MODE) &&
                (tc <= 256)) {
                tc += 64;
                if (priv->plat->force_thresh_dma_mode)
                    stmmac_set_dma_operation_mode(priv, tc, tc, chan);
                else
                    stmmac_set_dma_operation_mode(priv, tc,
                                                  SF_DMA_MODE, chan);
                priv->xstats.threshold = tc;
            }
        } else if (unlikely(status[chan] == tx_hard_error)) {
            stmmac_tx_err(priv, chan);
        }
    }
}
```
(4) stmmac_napi_check
```c
static int stmmac_napi_check(struct stmmac_priv *priv, u32 chan)
{
    int status = stmmac_dma_interrupt_status(priv, priv->ioaddr,
                                             &priv->xstats, chan);
    struct stmmac_channel *ch = &priv->channel[chan];
    bool needs_work = false;

    if ((status & handle_rx) && ch->has_rx) {
        needs_work = true;
    } else {
        status &= ~handle_rx;
    }

    if ((status & handle_tx) && ch->has_tx) {
        needs_work = true;
    } else {
        status &= ~handle_tx;
    }

    /* We almost always end up in here, since the channel is shared
     * by tx and rx */
    if (needs_work && napi_schedule_prep(&ch->napi)) {
        /* Disable the interrupt first; the whole point of NAPI is to
         * reduce the number of interrupts */
        stmmac_disable_dma_irq(priv, priv->ioaddr, chan);

        /* napi_schedule eventually invokes the napi_struct bound when
         * the channels were first initialized, i.e. stmmac_napi_poll */
        __napi_schedule(&ch->napi);
    }

    return status;
}
```
(5) stmmac_napi_poll
Next comes the heavyweight function: it handles reception and also cleans up the resources used by the TX queues.
```c
static int stmmac_napi_poll(struct napi_struct *napi, int budget)
{
    /* Note this budget: it is the quota NAPI grants each poll function;
     * the poll function may only do that much work per invocation */
    struct stmmac_channel *ch =
        container_of(napi, struct stmmac_channel, napi);
    struct stmmac_priv *priv = ch->priv_data;
    int work_done, rx_done = 0, tx_done = 0;
    u32 chan = ch->index;

    priv->xstats.napi_poll++;

    if (ch->has_tx)
        /* cleans the TX queue; returns how many entries were cleaned */
        tx_done = stmmac_tx_clean(priv, budget, chan);
    if (ch->has_rx)
        /* receives packets; returns how many were received */
        rx_done = stmmac_rx(priv, budget, chan);

    /* take the larger of the two counts */
    work_done = max(rx_done, tx_done);
    /* and cap it at the allotted budget */
    work_done = min(work_done, budget);

    /* If even the larger of the TX-clean and RX counts is below budget,
     * there is headroom left and this function may be entered again.
     * The consumed quota is handed back to NAPI via napi_complete_done,
     * which decides whether a reschedule is requested. */
    if (work_done < budget && napi_complete_done(napi, work_done)) {
        int stat;

        /* matches the disable done when we were first scheduled */
        stmmac_enable_dma_irq(priv, priv->ioaddr, chan);
        stat = stmmac_dma_interrupt_status(priv, priv->ioaddr,
                                           &priv->xstats, chan);
        /* Study this code carefully: it squeezes out every last bit
         * of efficiency */
        if (stat && napi_reschedule(napi))
            stmmac_disable_dma_irq(priv, priv->ioaddr, chan);
    }

    return work_done;
}
```
(6) stmmac_rx
```c
static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
{
    struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
    struct stmmac_channel *ch = &priv->channel[queue];
    unsigned int next_entry = rx_q->cur_rx;
    int coe = priv->hw->rx_csum;
    unsigned int count = 0;
    bool xmac;

    /* ... */

    while (count < limit) {
        int entry, status;
        struct dma_desc *p;
        struct dma_desc *np;

        /* save rx_q->cur_rx first */
        entry = next_entry;

        /* the oft-seen xgmac vs. gmac4 distinction; ignore it */
        if (priv->extend_desc)
            p = (struct dma_desc *)(rx_q->dma_erx + entry);
        else
            /* fetch the hardware DMA descriptor for cur_rx */
            p = rx_q->dma_rx + entry;

        /* read the status of the incoming frame */
        /* This calls down into the hardware ops and reads the status
         * from the DMA hardware into priv->xstats */
        status = stmmac_rx_status(priv, &priv->dev->stats,
                                  &priv->xstats, p);

        /* check if managed by the DMA otherwise go ahead */
        /* This matters: dma_own means the descriptor is still held by
         * the DMA, i.e. reception is not finished; the flag is cleared
         * to 0 when reception completes */
        if (unlikely(status & dma_own))
            break;

        count++;

        /* fetch the next hardware descriptor; it should be empty */
        rx_q->cur_rx = STMMAC_GET_ENTRY(rx_q->cur_rx, DMA_RX_SIZE);
        next_entry = rx_q->cur_rx;

        if (priv->extend_desc)
            np = (struct dma_desc *)(rx_q->dma_erx + next_entry);
        else
            np = rx_q->dma_rx + next_entry;

        prefetch(np);

        /* ... error checks, not relevant here ... */

        struct sk_buff *skb;
        int frame_len;
        unsigned int des;

        /* pull the buffer address out of the hardware descriptor */
        stmmac_get_desc_addr(priv, p, &des);
        /* get the length of the received data */
        frame_len = stmmac_get_rx_frame_len(priv, p, coe);

        /* If frame length is greater than skb buffer size
         * (preallocated during init) then the packet is
         * ignored */
        /* i.e. mismatched MTUs really can cause packet loss */
        if (frame_len > priv->dma_buf_sz) {
            if (net_ratelimit())
                netdev_err(priv->dev,
                           "len %d larger than size (%d)\n",
                           frame_len, priv->dma_buf_sz);
            priv->dev->stats.rx_length_errors++;
            continue;
        }

        /* The zero-copy is always used for all the sizes in case of
         * GMAC4 because it needs to refill the used descriptors,
         * always. */
        /* Interesting code: some older cores (mac10/100/1000) have no
         * zero-copy path at all */
        if (unlikely(!xmac &&
                     ((frame_len < priv->rx_copybreak) ||
                      stmmac_rx_threshold_count(rx_q)))) {
            /* note that a second skb is allocated here */
            skb = netdev_alloc_skb_ip_align(priv->dev, frame_len);

            /* This syncs the DMA buffer toward the CPU: DMA bypasses
             * the CPU and takes no part in cache coherency, so the data
             * must be synced manually to be sure the CPU sees the
             * latest bytes */
            dma_sync_single_for_cpu(priv->device,
                                    rx_q->rx_skbuff_dma[entry],
                                    frame_len, DMA_FROM_DEVICE);
            /* copy the data out of the DMA buffer; this is a real copy,
             * which costs time */
            skb_copy_to_linear_data(skb,
                                    rx_q->rx_skbuff[entry]->data,
                                    frame_len);
            skb_put(skb, frame_len);
            /* and the same sync in the opposite direction */
            dma_sync_single_for_device(priv->device,
                                       rx_q->rx_skbuff_dma[entry],
                                       frame_len, DMA_FROM_DEVICE);
        } else {
            skb = rx_q->rx_skbuff[entry];

            /* ... */

            prefetch(skb->data - NET_IP_ALIGN);
            rx_q->rx_skbuff[entry] = NULL;
            rx_q->rx_zeroc_thresh++;

            skb_put(skb, frame_len);
            /* this single step replaces the three above; simple and
             * direct, hence zero-copy */
            dma_unmap_single(priv->device,
                             rx_q->rx_skbuff_dma[entry],
                             priv->dma_buf_sz, DMA_FROM_DEVICE);
        }

        /* fetch the hardware timestamp */
        stmmac_get_rx_hwtstamp(priv, p, np, skb);
        /* special handling for VLAN; not relevant here */
        stmmac_rx_vlan(priv->dev, skb);
        /* get the protocol number */
        skb->protocol = eth_type_trans(skb, priv->dev);

        /* checksum offloaded or not */
        if (unlikely(!coe))
            skb_checksum_none_assert(skb);
        else
            skb->ip_summed = CHECKSUM_UNNECESSARY;

        /* The skb is ready: this is the real upper-layer receive entry.
         * From here on, the kernel owns the skb the driver allocated
         * earlier */
        napi_gro_receive(&ch->napi, skb);

        /* update statistics */
        priv->dev->stats.rx_packets++;
        priv->dev->stats.rx_bytes += frame_len;
    }

    stmmac_rx_refill(priv, queue);

    return count;
}
```
Let's tally up what this function does:
① walk the RX DMA hardware descriptors
② find the descriptors the DMA has finished receiving into
③ unmap them from the DMA, i.e. flush the DMA data into the CPU cache
④ fill in some skb fields and hand the skb to the kernel via napi_gro_receive
⑤ call stmmac_rx_refill
(7) stmmac_rx_refill
Looking back at that summary, it is easy to guess what this function is for:
the skbs have been handed off to the kernel, so this function must hang fresh skbs back onto the ring.
```c
static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
{
    struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];

    /* "dirty" means something different from the TX side: here it is
     * the number of hardware DMA descriptors without an skb attached,
     * whereas on TX it is the number of free entries */
    int dirty = stmmac_rx_dirty(priv, queue);
    unsigned int entry = rx_q->dirty_rx;
    int bfsize = priv->dma_buf_sz;

    while (dirty-- > 0) {
        struct dma_desc *p;

        if (priv->extend_desc)
            p = (struct dma_desc *)(rx_q->dma_erx + entry);
        else
            p = rx_q->dma_rx + entry;

        if (likely(!rx_q->rx_skbuff[entry])) {
            struct sk_buff *skb;

            /* allocate a fresh skb */
            skb = netdev_alloc_skb_ip_align(priv->dev, bfsize);
            if (unlikely(!skb)) {
                /* so for a while no zero-copy! */
                rx_q->rx_zeroc_thresh = STMMAC_RX_THRESH;
                if (unlikely(net_ratelimit()))
                    dev_err(priv->device,
                            "fail to alloc skb entry %d\n",
                            entry);
                break;
            }

            /* attach it */
            rx_q->rx_skbuff[entry] = skb;
            /* and map it; exactly the same routine as before */
            rx_q->rx_skbuff_dma[entry] =
                dma_map_single(priv->device, skb->data,
                               bfsize, DMA_FROM_DEVICE);

            stmmac_set_desc_addr(priv, p,
                                 rx_q->rx_skbuff_dma[entry]);
            stmmac_refill_desc3(priv, rx_q, p);

            /* this tracks the spare zero-copy capacity */
            if (rx_q->rx_zeroc_thresh > 0)
                rx_q->rx_zeroc_thresh--;
        }

        dma_wmb();

        /* hand ownership of this descriptor back to the DMA */
        stmmac_set_rx_owner(priv, p, priv->use_riwt);

        dma_wmb();

        /* keep walking the ring, as discussed earlier */
        entry = STMMAC_GET_ENTRY(entry, DMA_RX_SIZE);
    }

    /* under normal conditions dirty_rx now equals cur_rx */
    rx_q->dirty_rx = entry;

    /* What I do not understand is why the tail pointer is rewritten
     * here; presumably a hardware requirement */
    stmmac_set_rx_tail_ptr(priv, priv->ioaddr, rx_q->rx_tail_addr, queue);
}
```
That completes the packet handling. One last step remains: reclaiming the TX queue resources.
4. TX queue resource reclamation
(1) stmmac_tx_clean
As seen in (5) of 3. Receive flow, this is the function that reclaims the resources:
```c
static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue)
{
    struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue];
    unsigned int bytes_compl = 0, pkts_compl = 0;
    unsigned int entry, count = 0;

    __netif_tx_lock_bh(netdev_get_tx_queue(priv->dev, queue));

    priv->xstats.tx_clean++;

    entry = tx_q->dirty_tx;
    while ((entry != tx_q->cur_tx) && (count < budget)) {
        struct sk_buff *skb = tx_q->tx_skbuff[entry];
        struct dma_desc *p;
        int status;

        p = tx_q->dma_tx + entry;
        status = stmmac_tx_status(priv, &priv->dev->stats,
                                  &priv->xstats, p, priv->ioaddr);

        /* Checking this flag tells us whether the DMA is still working
         * on the descriptor, as covered above */
        /* Check if the descriptor is owned by the DMA */
        if (unlikely(status & tx_dma_own))
            break;

        count++;

        dma_rmb();

        /* Just consider the last segment and ... */
        if (likely(!(status & tx_not_ls))) {
            /* ... verify the status error condition */
            if (unlikely(status & tx_err)) {
                priv->dev->stats.tx_errors++;
            } else {
                priv->dev->stats.tx_packets++;
                priv->xstats.tx_pkt_n++;
            }
            stmmac_get_tx_hwtstamp(priv, p, skb);
        }

        if (likely(tx_q->tx_skbuff_dma[entry].buf)) {
            /* Matches the earlier discussion: was this a frags[i]
             * mapping or an sk_buff->data mapping? */
            if (tx_q->tx_skbuff_dma[entry].map_as_page)
                dma_unmap_page(priv->device,
                               tx_q->tx_skbuff_dma[entry].buf,
                               tx_q->tx_skbuff_dma[entry].len,
                               DMA_TO_DEVICE);
            else
                /* the sk_buff->data mapping */
                dma_unmap_single(priv->device,
                                 tx_q->tx_skbuff_dma[entry].buf,
                                 tx_q->tx_skbuff_dma[entry].len,
                                 DMA_TO_DEVICE);

            /* update the bookkeeping */
            tx_q->tx_skbuff_dma[entry].buf = 0;
            tx_q->tx_skbuff_dma[entry].len = 0;
            tx_q->tx_skbuff_dma[entry].map_as_page = false;
        }

        stmmac_clean_desc3(priv, tx_q, p);

        tx_q->tx_skbuff_dma[entry].last_segment = false;
        tx_q->tx_skbuff_dma[entry].is_jumbo = false;

        /* statistics */
        if (likely(skb != NULL)) {
            pkts_compl++;
            bytes_compl += skb->len;
            dev_consume_skb_any(skb);
            tx_q->tx_skbuff[entry] = NULL;
        }

        /* wipe the hardware DMA descriptor */
        stmmac_release_tx_desc(priv, p, priv->mode);

        entry = STMMAC_GET_ENTRY(entry, DMA_TX_SIZE);
    }

    /* dirty_tx is worth studying; it is classic ring-buffer handling */
    tx_q->dirty_tx = entry;

    /* update the queue state on the net_device side, which the kernel
     * operates on */
    netdev_tx_completed_queue(netdev_get_tx_queue(priv->dev, queue),
                              pkts_compl, bytes_compl);

    /* wake the kernel's queue again if it had been stopped and the
     * ring has room again */
    if (unlikely(netif_tx_queue_stopped(netdev_get_tx_queue(priv->dev,
                                                            queue))) &&
        stmmac_tx_avail(priv, queue) > STMMAC_TX_THRESH) {
        netif_dbg(priv, tx_done, priv->dev,
                  "%s: restart transmit\n", __func__);
        netif_tx_wake_queue(netdev_get_tx_queue(priv->dev, queue));
    }

    __netif_tx_unlock_bh(netdev_get_tx_queue(priv->dev, queue));

    return count;
}
```