Linux Device Driver Part IV. PCI Device

文中代码基于Linux 5.1 rc6版本

MODULE_DEVICE_TABLE

在讨论PCI驱动之前,我们先考察一下PCI驱动的Kernel Module是如何被自动加载的。首先,每个PCI Function有5个(或3个)Configuration寄存器,即Device ID、Vendor ID、Class Code(由Class、Subclass、Interface三个Byte构成)以及Subsystem ID、Subsystem Vendor ID(前三个必选,后两个可选),在Linux Kernel中用pci_device_id表示:

1
2
3
4
5
6
/* One match entry in a driver's ID table; PCI_ANY_ID wildcards a field. */
struct pci_device_id {
__u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/
__u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */
__u32 class, class_mask; /* (class,subclass,prog-if) triplet */
kernel_ulong_t driver_data; /* Data private to the driver */
};

每个非内置的PCI驱动,都要声明一个pci_device_id列表(以数组的形式),用于表示其支持的PCI设备,我们假设列表为id_tbl,则使用宏MODULE_DEVICE_TABLE(pci, id_tbl)声明该列表,这个信息最终会编译到.ko文件中。我们来看一下具体过程:

首先,通过MODULE_DEVICE_TABLE宏,我们声明了id_tbl的一个别名__mod_pci__id_tbl_device_table

1
2
3
4
/* Creates an alias so file2alias.c can find device table. */
/* Emits __mod_<type>__<name>_device_table as an alias of <name>;
* modpost looks for symbols of this shape in <modname>.o and turns
* each table entry into a MODULE_ALIAS() line in <modname>.mod.c. */
#define MODULE_DEVICE_TABLE(type, name) \
extern typeof(name) __mod_##type##__##name##_device_table \
__attribute__ ((unused, alias(__stringify(name))))

然后,在编译过程的modpost(Module Postprocessing)这一步,使用modpost生成<modname>.mod.c时,会读取并解析<modname>.o文件,当遇到形如__mod_##type##__##name##_device_table的变量名时,就会进行特别处理,根据变量name的值向<modname>.mod.c中输出相应MODULE_ALIAS语句,相关代码位于scripts/mod/file2alias.c

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/* Walks a device-ID table found in the module object file and, for every
* entry that do_entry() converts into an alias string, emits one
* MODULE_ALIAS("...") line into the module's generated .mod.c buffer. */
static void do_table(void *symval, unsigned long size,
unsigned long id_size,
const char *device_id,
int (*do_entry)(const char *filename, void *symval, char *alias),
struct module *mod)
{
unsigned int i;
char alias[ALIAS_SIZE];

device_id_check(mod->name, device_id, size, id_size, symval);
/* Leave last one: it's the terminator. */
size -= id_size;

for (i = 0; i < size; i += id_size) {
/* Non-zero return from do_entry() means a valid alias was produced. */
if (do_entry(mod->name, symval+i, alias)) {
buf_printf(&mod->dev_table_buf,
"MODULE_ALIAS(\"%s\");\n", alias);
}
}
}

MODULE_ALIAS的效果就是向最终的.ko文件中的.modinfo段添加了变量。最终,这个变量可以被modinfo程序读出:

1
2
3
4
5
6
7
8
9
10
/* Places a "tag=info" string into the .modinfo section of the .ko so
* userspace tools (modinfo, depmod) can read it back. */
#define __MODULE_INFO(tag, name, info)                              \
static const char __UNIQUE_ID(name)[] \
__used __attribute__((section(".modinfo"), unused, aligned(1))) \
= __stringify(tag) "=" info

/* Generic info of form tag = "info" */
#define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info)

/* For userspace: you can also call me... */
#define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias)

例如下面这个模块,其alias为pci:v00001022d0000746Bsv*sd*bc*sc*i*,实际上PCI的alias格式均为pci:vNdNsvNsdNbcNscNiN,其中v表示Vendor ID,d表示Device ID,sv表示Subsystem Vendor ID,sd表示Subsystem ID,bc表示Class(Base Class),sc表示Subclass,i表示Interface:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
~$ modinfo gpio-amd8111
filename: /lib/modules/4.15.0-48-generic/kernel/drivers/gpio/gpio-amd8111.ko
license: GPL
description: GPIO driver for AMD chipsets
author: The Linux Kernel team
srcversion: 589A6155F066F900B920218
alias: pci:v00001022d0000746Bsv*sd*bc*sc*i*
depends:
retpoline: Y
intree: Y
name: gpio_amd8111
vermagic: 4.15.0-48-generic SMP mod_unload
signat: PKCS#7
signer:
sig_key:
sig_hashalgo: md4

make modules_install时,最后一步是运行depmod程序,它会根据.ko文件中.modinfo段的内容,生成modules.alias和modules.alias.bin文件(通常位于/lib/modules/$(uname -r)/,如果改变了modules的安装路径那么生成的文件也会放到安装路径),其中收集了所有Kernel Module的alias信息。例如上面的gpio-amd8111,在modules.alias中对应的条目如下:

1
alias pci:v00001022d0000746Bsv*sd*bc*sc*i* gpio_amd8111

在内核启动时,会遍历PCI总线来枚举PCI设备,对每个检测到的设备都会在Device Model中创建一个Device,其中有一个属性就是modalias,它的格式也是pci:vNdNsvNsdNbcNscNiN,这样一来,udev就可以根据这个modalias属性到modules.alias中查找对应的Kernel Module,然后加载设备对应的Module,如此便实现了PCI驱动的自动加载。modalias的实现代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
/* sysfs "modalias" attribute: formats the pci:vNdNsvNsdNbcNscNiN string
* that udev matches against modules.alias to autoload the right driver. */
static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct pci_dev *pci_dev = to_pci_dev(dev);

/* class holds 3 bytes (base, sub, prog-if); shift out each byte. */
return sprintf(buf, "pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02X\n",
pci_dev->vendor, pci_dev->device,
pci_dev->subsystem_vendor, pci_dev->subsystem_device,
(u8)(pci_dev->class >> 16), (u8)(pci_dev->class >> 8),
(u8)(pci_dev->class));
}
static DEVICE_ATTR_RO(modalias); /* read-only attribute */

PCI Device Enumerating

Essential Structures

上面说到,内核启动时会枚举PCI总线上的所有设备,对每个检测到的设备都会创建一个struct pci_dev,其中嵌入了一个struct device

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/* The pci_dev structure describes PCI devices */
struct pci_dev {
struct list_head bus_list; /* Node in per-bus list */
struct pci_bus *bus; /* Bus this device is on */
struct pci_bus *subordinate; /* Bus this device bridges to */

void *sysdata; /* Hook for sys-specific extension */
struct proc_dir_entry *procent; /* Device entry in /proc/bus/pci */
struct pci_slot *slot; /* Physical slot this device is in */

unsigned int devfn; /* Encoded device & function index */
unsigned short vendor; /* Vendor ID config register */
unsigned short device; /* Device ID config register */
unsigned short subsystem_vendor; /* Subsystem Vendor ID config register */
unsigned short subsystem_device; /* Subsystem ID config register */
unsigned int class; /* 3 bytes: (base,sub,prog-if) */
u8 revision; /* PCI revision, low byte of class word */
u8 hdr_type; /* PCI header type (`multi' flag masked out) */

/* ... */

struct pci_driver *driver; /* Driver bound to this device */

/* ... */

pci_channel_state_t error_state; /* Current connectivity state */
struct device dev; /* Generic device interface */

int cfg_size; /* Size of config space */

/*
* Instead of touching interrupt line and base address registers
* directly, use the values stored here. They might be different!
*/
unsigned int irq;
struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */

bool match_driver; /* Skip attaching driver */

/* ... */

pci_dev_flags_t dev_flags;
atomic_t enable_cnt; /* pci_enable_device has been called */

u32 saved_config_space[16]; /* Config space saved at suspend time */
struct hlist_head saved_cap_space;
struct bin_attribute *rom_attr; /* Attribute descriptor for sysfs ROM entry */
int rom_attr_enabled; /* Display of ROM attribute enabled? */
struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */

/* ... */

phys_addr_t rom; /* Physical address if not from BAR */
size_t romlen; /* Length if not from BAR */
char *driver_override; /* Driver name to force a match */

unsigned long priv_flags; /* Private flags for the PCI driver */
};

这个pci_dev对象,就对应于类似/sys/devices/pciNNNN:NN/NNNN:NN:NN.N/这样的目录,udev可以利用其中的modalias属性文件动态加载驱动模块。

每个pci_dev都有一个pci_driver类型的成员,即它的驱动,它们是在probe时配对的,关于probe下一节再介绍。我们可以发现,每个pci_dev都属于一个pci_bus,实际上它会挂在pci_bus->devices链表下。由于设备本身可能是一个Bridge,它下面可能还有一个Secondary Bus,即pci_dev->subordinate。需要指出的是,pci_bus实际上也是一个Device,而不是Bus对象。

pci_bus的定义如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* Represents one PCI bus; note it embeds a struct device (registered
* under pcibus_class), i.e. a bus is itself a Device in the device model. */
struct pci_bus {
struct list_head node; /* Node in list of buses */
struct pci_bus *parent; /* Parent bus this bridge is on */
struct list_head children; /* List of child buses */
struct list_head devices; /* List of devices on this bus */
struct pci_dev *self; /* Bridge device as seen by parent */
struct list_head slots; /* List of slots on this bus;
protected by pci_slot_mutex */
struct resource *resource[PCI_BRIDGE_RESOURCE_NUM]; /* Fixed-size window table */
struct list_head resources; /* Address space routed to this bus */
struct resource busn_res; /* Bus numbers routed to this bus */

struct pci_ops *ops; /* Configuration access functions */
struct msi_controller *msi; /* MSI controller */
void *sysdata; /* Hook for sys-specific extension */
struct proc_dir_entry *procdir; /* Directory entry in /proc/bus/pci */

unsigned char number; /* Bus number */
unsigned char primary; /* Number of primary bridge */
unsigned char max_bus_speed; /* enum pci_bus_speed */
unsigned char cur_bus_speed; /* enum pci_bus_speed */
#ifdef CONFIG_PCI_DOMAINS_GENERIC
int domain_nr; /* PCI domain (segment) number */
#endif

char name[48];

unsigned short bridge_ctl; /* Manage NO_ISA/FBB/et al behaviors */
pci_bus_flags_t bus_flags; /* Inherited by child buses */
struct device *bridge; /* Parent of the embedded dev below */
struct device dev; /* Embedded generic device (pcibus_class) */
struct bin_attribute *legacy_io; /* Legacy I/O for this bus */
struct bin_attribute *legacy_mem; /* Legacy mem */
unsigned int is_added:1; /* NOTE(review): presumably set once registered — confirm */
};

每个pci_bus可以有一个parent,还可以有一个children链表,每个PCI Domain(即PCI Segment Group)可以构成一棵pci_bus树,即Host Bridge或Root Complex下辖的总线树,所有根节点的pci_bus对象都存放在pci_root_buses链表中:

1
2
3
/* Do NOT directly access these two variables, unless you are arch-specific PCI
* code, or PCI core code. */
extern struct list_head pci_root_buses; /* List of all known PCI buses */

对于每个根节点,都有一个对应的pci_host_bridge

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/* One per PCI root bus: the Host Bridge / Root Complex. Embeds a struct
* device and points at the root pci_bus it spawns. */
struct pci_host_bridge {
struct device dev; /* Embedded generic device */
struct pci_bus *bus; /* Root bus */
struct pci_ops *ops; /* Config-space accessors for the hierarchy */
void *sysdata;
int busnr; /* Root bus number */
struct list_head windows; /* resource_entry */
u8 (*swizzle_irq)(struct pci_dev *, u8 *);
int (*map_irq)(const struct pci_dev *, u8, u8);
void (*release_fn)(struct pci_host_bridge *);
void *release_data;
struct msi_controller *msi;
unsigned int ignore_reset_delay:1; /* For entire hierarchy */
unsigned int no_ext_tags:1; /* No Extended Tags */
unsigned int native_aer:1; /* OS may use PCIe AER */
unsigned int native_pcie_hotplug:1; /* OS may use PCIe hotplug */
unsigned int native_shpc_hotplug:1; /* OS may use SHPC hotplug */
unsigned int native_pme:1; /* OS may use PCIe PME */
unsigned int native_ltr:1; /* OS may use PCIe LTR */
/* Resource alignment requirements */
resource_size_t (*align_resource)(struct pci_dev *dev,
const struct resource *res,
resource_size_t start,
resource_size_t size,
resource_size_t align);
unsigned long private[0] ____cacheline_aligned; /* Bridge-driver private data */
};

Bus Scanning Overview

有了以上的基本概念以后,我们就可以开始考察启动时的PCI设备枚举过程了。在内核中,扫描PCI总线的过程可以分为四个步骤:

  1. pci_alloc_host_bridge(),创建pci_host_bridge对象
  2. pci_register_host_bridge(),将pci_host_bridge注册到sysfs中,并创建、注册root bus的pci_bus对象
  3. pci_scan_child_bus(),从root bus开始递归地扫描总线,最终会在pci_scan_single_device中为每个设备创建、初始化并注册pci_dev对象
  4. pci_bus_add_devices(),将device和driver绑定起来,这一步会调用到pci_driver的probe回调函数

内核中,不同体系结构或同一体系结构但不同的具体配置,启动时触发PCI设备枚举的Code Path都不同。

对于不支持ACPI的平台,通常Host Bridge本身会作为一个Platform Device进行实现,在其probe函数中会调用pci_alloc_host_bridge创建pci_host_bridge对象,然后进行初始化,随后再调用pci_scan_root_bus_bridge,它会进行上述第2、3步,最后再调用pci_bus_add_devices激活这些设备的驱动即可。有时,也可以使用pci_host_probe,该函数会依次调用pci_scan_root_bus_bridgepci_bus_add_devices

此外,有少数情况下,会调用pci_scan_root_bus,该函数会依次进行上述第1-3步,然后返回一个pci_bus对象(即root bus),这适用于不需要获取和设置pci_host_bridge对象的情况。此类情形中最具代表性的就是x86平台在不使用ACPI的情况下,会调用pci_scan_root_bus进行PCI设备枚举,其调用链如下:

1
2
3
4
5
6
7
8
subsys_initcall(pci_subsys_init)
--> x86_init.pci.init() ==> pci_legacy_init()
--> pci_bios_scan_root(0)
--> pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources)
--> pcibios_fixup_peer_bridges()
--> pcibios_scan_specific_bus(n) /* 仅当CONFIG_PCI_BIOS = y时被调用,64位下CONFIG_PCI_BIOS = n */
--> pci_bios_scan_root(busn)
--> pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources)

CONFIG_PCI_BIOS = y表示使用PCI BIOS进行PCI Configuration Space的访问等操作,CONFIG_PCI_DIRECT = y表示绕过BIOS由Linux直接访问PCI总线。由于在64位模式下不能调用BIOS服务,故64位内核只支持CONFIG_PCI_DIRECT = y

当然,对于x86平台来说另一种PCI初始化方式就是ACPI(此外ARM服务器也支持ACPI),此时的调用链为:

1
2
3
4
5
6
7
8
9
10
11
subsys_initcall(acpi_init)
--> acpi_scan_init()
--> acpi_bus_scan(ACPI_ROOT_OBJECT)
--> acpi_bus_attach(device) --> acpi_bus_attach(child) --> ... --> acpi_bus_attach(child)
--> acpi_scan_attach_handler(device)
--> handler->attach(device, devid) ==> acpi_pci_root_add(device, devid)
--> pci_acpi_scan_root(root)
--> acpi_pci_root_create(root, &acpi_pci_root_ops, &info->common, &info->sd)
--> pci_create_root_bus(NULL, busnum, ops->pci_ops, sysdata, &info->resources)
--> pci_scan_child_bus(bus)
--> pci_bus_add_devices(root->bus)

这里使用了pci_create_root_bus函数实现了上述第1、2步,其余没什么特殊的。当然,由于ACPI支持热插拔,所以实际上在任意时刻都有可能因热插拔而触发acpi_bus_scan并最终引起PCI总线扫描。

Resource Management

在内核中有一个resource数据结构,用于IO端口、MMIO内存区域等资源的分配、管理:

1
2
3
4
5
6
7
8
9
10
11
12
/*
* Resources are tree-like, allowing
* nesting etc..
*/
struct resource {
resource_size_t start; /* First address of the range */
resource_size_t end; /* Last address (inclusive) */
const char *name;
unsigned long flags; /* IORESOURCE_* type and attribute bits */
unsigned long desc;
struct resource *parent, *sibling, *child; /* Tree links */
};

可以看到,resource是以树的形式组织的,并且内核中实际上不止一棵树,例如在kernel/resource.c中就定义了Port IO和MMIO两种资源的根节点:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/* Root of the Port I/O resource tree. */
struct resource ioport_resource = {
.name = "PCI IO",
.start = 0,
.end = IO_SPACE_LIMIT,
.flags = IORESOURCE_IO,
};
EXPORT_SYMBOL(ioport_resource);

/* Root of the MMIO (physical memory address space) resource tree. */
struct resource iomem_resource = {
.name = "PCI mem",
.start = 0,
.end = -1, /* all ones: entire address space */
.flags = IORESOURCE_MEM,
};
EXPORT_SYMBOL(iomem_resource);

我们可以使用request_resource(root, new)向root添加一个resource,或者用__request_region(root, start, n, name, flags)让内核帮我们构造resource然后添加到root。resource一共有六种:

1
2
3
4
5
6
7
#define IORESOURCE_TYPE_BITS    0x00001f00  /* Resource type */
#define IORESOURCE_IO 0x00000100 /* PCI/ISA I/O ports */
#define IORESOURCE_MEM 0x00000200 /* Memory-mapped I/O */
#define IORESOURCE_REG 0x00000300 /* Register offsets */
#define IORESOURCE_IRQ 0x00000400 /* Interrupt line */
#define IORESOURCE_DMA 0x00000800 /* DMA channel */
#define IORESOURCE_BUS 0x00001000 /* Bus number range */

即Port IO、MMIO、Register Offsets、IRQ、DMA和Bus,其中Port IO和MMIO还有专门的辅助函数request_region(start, n, name)(Port IO)和request_mem_region(start, n, name)(MMIO)。

上述函数还有对应的devm_引用计数版本,当struct device的引用计数归零时会自动release注册的resource,例如为一个设备申请IO端口可以使用devm_request_region(dev, start, n, name)

pci_register_host_bridge

我们首先考察pci_register_host_bridge(bridge)

第一,它注册了bridge对应的Device,其目录为/sys/devices/pciNNNN:NN,NNNN:NN表示Domain Number、Bus Number:

1
2
3
4
5
6
7
8
9
10
dev_set_name(&bridge->dev, "pci%04x:%02x", pci_domain_nr(bus),
bridge->busnr);

err = pcibios_root_bridge_prepare(bridge);
if (err)
goto free;

err = device_register(&bridge->dev);
if (err)
put_device(&bridge->dev);

第二,创建了根总线的pci_bus对象,其成员依据host bridge设置:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
bus = pci_alloc_bus(NULL);
if (!bus)
return -ENOMEM;

bridge->bus = bus;

/* Temporarily move resources off the list */
list_splice_init(&bridge->windows, &resources);
bus->sysdata = bridge->sysdata;
bus->msi = bridge->msi;
bus->ops = bridge->ops;
bus->number = bus->busn_res.start = bridge->busnr;
#ifdef CONFIG_PCI_DOMAINS_GENERIC
bus->domain_nr = pci_bus_find_domain_nr(bus, parent);
#endif

第三,将根总线的Device注册到sysfs中:

1
2
3
4
5
6
7
8
9
bus->dev.class = &pcibus_class;
bus->dev.parent = bus->bridge;

dev_set_name(&bus->dev, "%04x:%02x", pci_domain_nr(bus), bus->number);
name = dev_name(&bus->dev);

err = device_register(&bus->dev);
if (err)
goto unregister;

从中我们可以发现pci_bus实际上仍是Device而不是Bus,它们属于pcibus_class这个Class,它们对应的目录应该位于/sys/devices/pciNNNN:NN/pci_bus/XXXX:XX/.../ZZZZ:ZZ

pcibus_class提供了三个默认属性rescan, cpuaffinity, cpulistaffinity

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/* Default sysfs attributes for every pci_bus device. */
static struct attribute *pcibus_attrs[] = {
&dev_attr_rescan.attr, /* writable: triggers a bus rescan */
&dev_attr_cpuaffinity.attr, /* read-only */
&dev_attr_cpulistaffinity.attr, /* read-only */
NULL,
};

static const struct attribute_group pcibus_group = {
.attrs = pcibus_attrs,
};

const struct attribute_group *pcibus_groups[] = {
&pcibus_group,
NULL,
};

/* The class all pci_bus devices belong to (appears as /sys/class/pci_bus). */
static struct class pcibus_class = {
.name = "pci_bus",
.dev_release = &release_pcibus_dev,
.dev_groups = pcibus_groups,
};

这其中rescan可写而后两者只读,向rescan写入非零值即可重新扫描该bus:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/* Per-bus "rescan" sysfs store: writing a non-zero value rescans the bus
* (rescan+resize via the upstream bridge for an empty non-root bus). */
static ssize_t dev_bus_rescan_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
unsigned long val;
struct pci_bus *bus = to_pci_bus(dev);

if (kstrtoul(buf, 0, &val) < 0)
return -EINVAL;

if (val) {
pci_lock_rescan_remove();
/* Empty non-root bus: rescan through its bridge so bridge
* resources can be resized as well. */
if (!pci_is_root_bus(bus) && list_empty(&bus->devices))
pci_rescan_bus_bridge_resize(bus->self);
else
pci_rescan_bus(bus);
pci_unlock_rescan_remove();
}
return count;
}
static DEVICE_ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, dev_bus_rescan_store); /* write-only */

/* Rescan the bus below @bridge, reassign bridge resources, then bind
* drivers to any newly found devices. Returns the max bus number found. */
unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge)
{
unsigned int max;
struct pci_bus *bus = bridge->subordinate;

max = pci_scan_child_bus(bus);

pci_assign_unassigned_bridge_resources(bridge);

pci_bus_add_devices(bus);

return max;
}

/* Rescan @bus, assign resources left unassigned, and add the devices.
* Returns the max bus number found. */
unsigned int pci_rescan_bus(struct pci_bus *bus)
{
unsigned int max;

max = pci_scan_child_bus(bus);
pci_assign_unassigned_bus_resources(bus);
pci_bus_add_devices(bus);

return max;
}

第四,为根总线设置了resource:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/* Add initial resources to the bus */
resource_list_for_each_entry_safe(window, n, &resources) {
list_move_tail(&window->node, &bridge->windows);
offset = window->offset;
res = window->res;

if (res->flags & IORESOURCE_BUS)
pci_bus_insert_busn_res(bus, bus->number, res->end);
else
pci_bus_add_resource(bus, res, 0);

if (offset) {
if (resource_type(res) == IORESOURCE_IO)
fmt = " (bus address [%#06llx-%#06llx])";
else
fmt = " (bus address [%#010llx-%#010llx])";

snprintf(addr, sizeof(addr), fmt,
(unsigned long long)(res->start - offset),
(unsigned long long)(res->end - offset));
} else
addr[0] = '\0';

dev_info(&bus->dev, "root bus resource %pR%s\n", res, addr);
}

这里的添加到根总线的resource来自bridge->windows,在pci_register_host_bridge之前会由调用者设置好,例如在x86的legacy模式下,是在x86_pci_root_bus_resource中设置好了resource链表,然后作为最后一个参数传给pci_scan_root_bus

回顾一下pci_bus的定义:

1
2
3
4
5
6
7
8
9
struct pci_bus {
/* ... */

struct resource *resource[PCI_BRIDGE_RESOURCE_NUM];
struct list_head resources; /* Address space routed to this bus */
struct resource busn_res; /* Bus numbers routed to this bus */

/* ... */
};

上文的pci_bus_insert_busn_res是向内核注册了类型为IORESOURCE_BUS的resource,并将其赋值给了bus->busn_respci_bus_add_resource则是将resource添加到了bus->resources链表中,没有注册resource这一步,因为非bus类型的resource在之前申请时已经注册过。

pci_scan_child_bus

现在再来考察pci_scan_child_bus

它的第一步是以slot为单位探测devfn从0到255的所有设备(devfn编码了Device和Function,每个slot含最多8个Function,故devfn每次递增8):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/* Go find them, Rover! */
for (devfn = 0; devfn < 256; devfn += 8) {
nr_devs = pci_scan_slot(bus, devfn);

/*
* The Jailhouse hypervisor may pass individual functions of a
* multi-function device to a guest without passing function 0.
* Look for them as well.
*/
if (jailhouse_paravirt() && nr_devs == 0) {
for (fn = 1; fn < 8; fn++) {
dev = pci_scan_single_device(bus, devfn + fn);
if (dev)
dev->multifunction = 1;
}
}
}

这一步完成后,探测到的设备都建立了pci_dev对象,并加入到了bus->devices链表中,此时遍历该链表找到所有Bridge设备,进行递归扫描:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
/*
* Scan bridges that are already configured. We don't touch them
* unless they are misconfigured (which will be done in the second
* scan below).
*/
for_each_pci_bridge(dev, bus) {
cmax = max;
max = pci_scan_bridge_extend(bus, dev, max, 0, 0);

/*
* Reserve one bus for each bridge now to avoid extending
* hotplug bridges too much during the second scan below.
*/
used_buses++;
if (cmax - max > 1)
used_buses += cmax - max - 1;
}

/* Scan bridges that need to be reconfigured */
for_each_pci_bridge(dev, bus) {
unsigned int buses = 0;

if (!hotplug_bridges && normal_bridges == 1) {

/*
* There is only one bridge on the bus (upstream
* port) so it gets all available buses which it
* can then distribute to the possible hotplug
* bridges below.
*/
buses = available_buses;
} else if (dev->is_hotplug_bridge) {

/*
* Distribute the extra buses between hotplug
* bridges if any.
*/
buses = available_buses / hotplug_bridges;
buses = min(buses, available_buses - used_buses + 1);
}

cmax = max;
max = pci_scan_bridge_extend(bus, dev, cmax, buses, 1);
/* One bus is already accounted so don't add it again */
if (max - cmax > 1)
used_buses += max - cmax - 1;
}

我们实际上需要扫描两遍,第一遍处理BIOS已经配置过的Bridge,第二遍处理未配置的Bridge,分别对应于pci_scan_bridge_extend最后一个参数为0和1。


第一步中的pci_scan_slot的调用链如下:

1
2
3
4
5
6
pci_scan_slot(bus, devfn)
--> pci_scan_single_device(bus, devfn)
--> pci_scan_device(bus, devfn)
--> pci_alloc_dev(bus)
--> pci_setup_device(dev)
--> pci_device_add(dev, bus)

pci_alloc_dev中,为新创建的pci_dev对象设置的device_typepci_dev_type,定义在drivers/pci/pci-sysfs.c中,我们可以在其中查阅到PCI Device的各种属性的定义,这里就不详细深入了。

pci_setup_device中会对刚才创建的pci_dev对象进行初始化,这包括了利用pci_read_config_[byte|word|dword]读取Configuration Space中的信息来完成初始化。先看函数中的这个片段:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
hdr_type = pci_hdr_type(dev);

dev->sysdata = dev->bus->sysdata;
dev->dev.parent = dev->bus->bridge;
dev->dev.bus = &pci_bus_type;
dev->hdr_type = hdr_type & 0x7f;
dev->multifunction = !!(hdr_type & 0x80);
dev->error_state = pci_channel_io_normal;
set_pcie_port_type(dev);

/* ... */

dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(dev->bus),
dev->bus->number, PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn));

这表明PCI设备的真正的Bus定义在pci_bus_type变量中,PCI设备的目录为/sys/devices/<bridge path>/NNNN:NN:NN.N(Domain:Bus:Device.Function)。pci_bus_type中为PCI Device定义了许多默认属性:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/* Default sysfs attributes created for every PCI device bound to
* pci_bus_type (includes the modalias attribute used by udev). */
static struct attribute *pci_dev_attrs[] = {
&dev_attr_resource.attr,
&dev_attr_vendor.attr,
&dev_attr_device.attr,
&dev_attr_subsystem_vendor.attr,
&dev_attr_subsystem_device.attr,
&dev_attr_revision.attr,
&dev_attr_class.attr,
&dev_attr_irq.attr,
&dev_attr_local_cpus.attr,
&dev_attr_local_cpulist.attr,
&dev_attr_modalias.attr, /* used by udev for driver autoloading */
#ifdef CONFIG_NUMA
&dev_attr_numa_node.attr,
#endif
&dev_attr_dma_mask_bits.attr,
&dev_attr_consistent_dma_mask_bits.attr,
&dev_attr_enable.attr,
&dev_attr_broken_parity_status.attr,
&dev_attr_msi_bus.attr,
#if defined(CONFIG_PM) && defined(CONFIG_ACPI)
&dev_attr_d3cold_allowed.attr,
#endif
#ifdef CONFIG_OF
&dev_attr_devspec.attr,
#endif
&dev_attr_driver_override.attr,
&dev_attr_ari_enabled.attr,
NULL,
};

上文提到的modalias也在其中。而它为自己(即/sys/bus/pci)只定义了一个rescan属性:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/* Bus-level /sys/bus/pci/rescan store: writing a non-zero value rescans
* every known PCI bus in the system. */
static ssize_t rescan_store(struct bus_type *bus, const char *buf, size_t count)
{
unsigned long val;
struct pci_bus *b = NULL;

if (kstrtoul(buf, 0, &val) < 0)
return -EINVAL;

if (val) {
pci_lock_rescan_remove();
/* Iterate over all buses and rescan each. */
while ((b = pci_find_next_bus(b)) != NULL)
pci_rescan_bus(b);
pci_unlock_rescan_remove();
}
return count;
}
static BUS_ATTR_WO(rescan); /* write-only bus attribute */

再来看下面这个switch,它根据三种Configuration Space的Header Type,分别设置了pci_dev的一些field:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
switch (dev->hdr_type) {                    /* header type */
case PCI_HEADER_TYPE_NORMAL: /* standard header */
if (class == PCI_CLASS_BRIDGE_PCI)
goto bad;
pci_read_irq(dev);
pci_read_bases(dev, 6, PCI_ROM_ADDRESS);

pci_subsystem_ids(dev, &dev->subsystem_vendor, &dev->subsystem_device);
/* ... */
break;

case PCI_HEADER_TYPE_BRIDGE: /* bridge header */
/*
* The PCI-to-PCI bridge spec requires that subtractive
* decoding (i.e. transparent) bridge must have programming
* interface code of 0x01.
*/
pci_read_irq(dev);
dev->transparent = ((dev->class & 0xff) == 1);
pci_read_bases(dev, 2, PCI_ROM_ADDRESS1);
pci_read_bridge_windows(dev);
set_pcie_hotplug_bridge(dev);
pos = pci_find_capability(dev, PCI_CAP_ID_SSVID);
if (pos) {
pci_read_config_word(dev, pos + PCI_SSVID_VENDOR_ID, &dev->subsystem_vendor);
pci_read_config_word(dev, pos + PCI_SSVID_DEVICE_ID, &dev->subsystem_device);
}
break;

case PCI_HEADER_TYPE_CARDBUS: /* CardBus bridge header */
if (class != PCI_CLASS_BRIDGE_CARDBUS)
goto bad;
pci_read_irq(dev);
pci_read_bases(dev, 1, 0);
pci_read_config_word(dev, PCI_CB_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
pci_read_config_word(dev, PCI_CB_SUBSYSTEM_ID, &dev->subsystem_device);
break;

default: /* unknown header */
pci_err(dev, "unknown header type %02x, ignoring device\n",
dev->hdr_type);
return -EIO;

bad:
pci_err(dev, "ignoring class %#08x (doesn't match header type %02x)\n",
dev->class, dev->hdr_type);
dev->class = PCI_CLASS_NOT_DEFINED << 8;
}

回顾pci_dev的定义:

1
2
3
4
5
6
7
8
9
10
11
12
13
struct pci_dev {
u8 pin; /* Interrupt pin this device uses */
/* ... */

/*
* Instead of touching interrupt line and base address registers
* directly, use the values stored here. They might be different!
*/
unsigned int irq;
struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */

/* ... */
};

pci_read_irq函数就是将Configuration Space的Interrupt Pin和Interrupt Line寄存器的值读出,并分别赋值给dev->pindev->irq

Interrupt Pin取0表示禁用INTx中断,取1表示INTA#信号线,取2表示INTB#信号线,取3表示INTC#信号线,取4表示INTD#信号线。Interrupt Line表示INTx中断接到中断控制器(x86平台即IOAPIC)的哪一个IRQ引脚。

pci_read_bases函数就是将BAR读出,用BAR配置的地址空间初始化dev->resource[i]。对于PCI Device,有BAR0 - BAR5外加一个expansion ROM总共7个resource,对于PCI Bridge,只有BAR0和BAR1,加上expansion ROM总共有3个resource。

值得注意的是,pci_read_bridge_windows只是检查了PCI Bridge的IO Window、Prefetchable Memory Window是否有效,若有效则设置dev->io_window = 1dev->pref_window = 1,并未根据Window设置resource。那么IO Window、Memory Window等对应的resource是在哪里初始化的呢,答案是在pci_setup_bridge_iopci_setup_bridge_mmiopci_setup_bridge_mmio_pref三个函数中初始化的。通常会在pci_scan_child_bus完成后,pci_bus_add_devices执行前,调用pci_bus_assign_resources,它会递归地遍历所有总线,并对所有总线调用上述三个函数:

1
2
3
4
5
6
7
pci_bus_assign_resources
--> __pci_bus_assign_resources --> __pci_bus_assign_resources --> ... --> __pci_bus_assign_resources
--> pci_setup_bridge
--> __pci_setup_bridge
--> pci_setup_bridge_io
--> pci_setup_bridge_mmio
--> pci_setup_bridge_mmio_pref

另外,上文rescan的实现中出现的pci_assign_unassigned_bus_resources以及pci_assign_unassigned_bridge_resources,最终也会调用到pci_setup_bridge,重新配置其IO和Memory Window的resource。


pci_setup_device执行完毕后,下一步是调用pci_device_addpci_dev添加到其所属的总线pci_bus上,这个函数主要仍是在对pci_dev进行一些初始化工作,并将pci_dev添加到了pci_busdevices链表中,最后调用了device_add将pci_dev注册到sysfs:

1
2
3
4
5
6
7
8
9
10
11
12
/*
* Add the device to our list of discovered devices
* and the bus list for fixup functions, etc.
*/
down_write(&pci_bus_sem);
list_add_tail(&dev->bus_list, &bus->devices);
up_write(&pci_bus_sem);

/* Notifier could use PCI capabilities */
dev->match_driver = false;
ret = device_add(&dev->dev);
WARN_ON(ret < 0);

第二步中的pci_scan_bridge_extend实现比较复杂,不过去除我们不关心的细节后,其核心逻辑还是很清晰的:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
child = pci_find_bus(pci_domain_nr(bus), secondary);
if (!child) {
child = pci_add_new_bus(bus, dev, secondary);
if (!child)
goto out;
child->primary = primary;
pci_bus_insert_busn_res(child, secondary, subordinate);
child->bridge_ctl = bctl;
}

cmax = pci_scan_child_bus(child);
if (cmax > subordinate)
pci_warn(dev, "bridge has subordinate %02x but max busn %02x\n",
subordinate, cmax);

通过pci_add_new_bus创建子bus的pci_bus对象,并挂到父bus下,然后递归扫描子bus即可。pci_add_new_bus调用了pci_alloc_child_bus来创建子bus,然后将子bus添加到了父bus的children链表中。

pci_alloc_child_bus的实现基本和pci_register_host_bridge类似,不多展开,不过它增加了一个hook,即child->ops->add_bus,在child bus创建并初始化完毕后被调用:

1
2
3
4
5
if (child->ops->add_bus) {
ret = child->ops->add_bus(child);
if (WARN_ON(ret < 0))
dev_err(&child->dev, "failed to add bus: %d\n", ret);
}

PCI Device Probing

Matching and Dynamic ID

编写驱动时,主要是实现一个pci_driver对象,其定义如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/* A PCI driver: register with pci_register_driver(); probe() is invoked
* after the device model matches the driver against a pci_dev. */
struct pci_driver {
struct list_head node; /* Node in list of registered drivers */
const char *name; /* Driver name (sysfs directory name) */
const struct pci_device_id *id_table; /* Must be non-NULL for probe to be called */
int (*probe)(struct pci_dev *dev, const struct pci_device_id *id); /* New device inserted */
void (*remove)(struct pci_dev *dev); /* Device removed (NULL if not a hot-plug capable driver) */
int (*suspend)(struct pci_dev *dev, pm_message_t state); /* Device suspended */
int (*suspend_late)(struct pci_dev *dev, pm_message_t state);
int (*resume_early)(struct pci_dev *dev);
int (*resume)(struct pci_dev *dev); /* Device woken up */
void (*shutdown)(struct pci_dev *dev);
int (*sriov_configure)(struct pci_dev *dev, int num_vfs); /* On PF */
const struct pci_error_handlers *err_handler;
const struct attribute_group **groups; /* Extra driver sysfs attribute groups */
struct device_driver driver; /* Embedded generic driver */
struct pci_dynids dynids; /* IDs added at runtime via sysfs new_id */
};

我们在驱动模块(Kernel Module)的初始化函数中,要调用pci_register_driver(drv)将驱动注册到sysfs中。上面已经说过,PCI子系统在Device Model中的Bus是pci_bus_type,这个pci_register_driver仅仅是将Driver注册到该Bus上:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/* Fills in the embedded device_driver from the pci_driver wrapper and
* registers it on pci_bus_type. Returns 0 or a negative errno from
* driver_register(). */
int __pci_register_driver(struct pci_driver *drv, struct module *owner,
const char *mod_name)
{
/* initialize common driver fields */
drv->driver.name = drv->name;
drv->driver.bus = &pci_bus_type; /* all PCI drivers live on this bus */
drv->driver.owner = owner;
drv->driver.mod_name = mod_name;
drv->driver.groups = drv->groups;

spin_lock_init(&drv->dynids.lock);
INIT_LIST_HEAD(&drv->dynids.list); /* empty dynamic-ID list to start */

/* register with core */
return driver_register(&drv->driver);
}

我们知道,编写驱动必须实现pci_driver中的回调,不考虑电源管理的回调,最基本的就是probe函数,它会在Device和Driver成功匹配后被调用。

我们先来看pci_bus_type中定义的match回调pci_bus_match,它调用pci_match_device实现其功能:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/* Finds the pci_device_id under which @drv matches @dev, or NULL.
* Match order: driver_override gate, then dynamic IDs, then the driver's
* static id_table, finally a wildcard entry when driver_override is set. */
static const struct pci_device_id *pci_match_device(struct pci_driver *drv,
struct pci_dev *dev)
{
struct pci_dynid *dynid;
const struct pci_device_id *found_id = NULL;

/* When driver_override is set, only bind to the matching driver */
if (dev->driver_override && strcmp(dev->driver_override, drv->name))
return NULL;

/* Look at the dynamic ids first, before the static ones */
spin_lock(&drv->dynids.lock);
list_for_each_entry(dynid, &drv->dynids.list, node) {
if (pci_match_one_device(&dynid->id, dev)) {
found_id = &dynid->id;
break;
}
}
spin_unlock(&drv->dynids.lock);

if (!found_id)
found_id = pci_match_id(drv->id_table, dev);

/* driver_override will always match, send a dummy id */
if (!found_id && dev->driver_override)
found_id = &pci_device_id_any;

return found_id;
}

首先,PCI Driver在创建时(一般是静态分配)设置的id_table会用于和PCI Device的Device ID、Vendor ID、Class Code、Subsystem ID和Subsystem Vendor ID进行匹配。

此外,用户可以在系统运行时动态为PCI Driver添加Dynamic ID,这些动态添加的ID存放在drv->dynids,并会优先用于和PCI Device的匹配。要添加动态ID很简单,向/sys/bus/pci/drivers/<driver name>/new_id写入即可:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/* One runtime-added device ID on a driver's dynids list. */
struct pci_dynid {
struct list_head node;
struct pci_device_id id;
};

/* Allocates a pci_dynid from the given fields, appends it to
* drv->dynids, then immediately retries binding via driver_attach() so
* a matching, already-present device gets bound right away. */
int pci_add_dynid(struct pci_driver *drv,
unsigned int vendor, unsigned int device,
unsigned int subvendor, unsigned int subdevice,
unsigned int class, unsigned int class_mask,
unsigned long driver_data)
{
struct pci_dynid *dynid;

dynid = kzalloc(sizeof(*dynid), GFP_KERNEL);
if (!dynid)
return -ENOMEM;

dynid->id.vendor = vendor;
dynid->id.device = device;
dynid->id.subvendor = subvendor;
dynid->id.subdevice = subdevice;
dynid->id.class = class;
dynid->id.class_mask = class_mask;
dynid->id.driver_data = driver_data;

spin_lock(&drv->dynids.lock);
list_add_tail(&dynid->node, &drv->dynids.list);
spin_unlock(&drv->dynids.lock);

return driver_attach(&drv->driver);
}

/* sysfs new_id store: parses up to 7 hex fields (vendor and device are
* mandatory) and registers them as a dynamic ID. */
static ssize_t new_id_store(struct device_driver *driver, const char *buf,
size_t count)
{
/* ... */

fields = sscanf(buf, "%x %x %x %x %x %x %lx",
&vendor, &device, &subvendor, &subdevice,
&class, &class_mask, &driver_data);
if (fields < 2)
return -EINVAL;

/* ... */

retval = pci_add_dynid(pdrv, vendor, device, subvendor, subdevice,
class, class_mask, driver_data);
if (retval)
return retval;
return count;
}
static DRIVER_ATTR_WO(new_id); /* write-only driver attribute */

从上述代码可见,写入new_id的操作除了向drv->dynids链表添加一个新元素,还会立即引起Driver和新添加的ID对应的Device(如果当前存在)的绑定。与new_id相对应,PCI Driver还有一个remove_id属性,写入remove_id能将ID从drv->dynids链表中删除,但不会unbind ID对应的Device。

Probing and Enabling

一旦完成了匹配,就可以将相互匹配的Device和Driver绑定,这是在pci_bus_type的probe回调,即pci_device_probe中实现的。最终,会在local_pci_probe中调用Driver的probe回调:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/* pci_device_probe
--> __pci_device_probe
--> pci_call_probe
--> local_pci_probe
*/
/* pci_device_probe
--> __pci_device_probe
--> pci_call_probe
--> local_pci_probe
*/
/* Invokes the driver's probe() for one device; binds dev->driver before
* the call and unwinds it (plus the PM usage count) on failure. */
static long local_pci_probe(void *_ddi)
{
struct drv_dev_and_id *ddi = _ddi;
struct pci_dev *pci_dev = ddi->dev;
struct pci_driver *pci_drv = ddi->drv;
struct device *dev = &pci_dev->dev;
int rc;

/*
* Unbound PCI devices are always put in D0, regardless of
* runtime PM status. During probe, the device is set to
* active and the usage count is incremented. If the driver
* supports runtime PM, it should call pm_runtime_put_noidle(),
* or any other runtime PM helper function decrementing the usage
* count, in its probe routine and pm_runtime_get_noresume() in
* its remove routine.
*/
pm_runtime_get_sync(dev);
pci_dev->driver = pci_drv;
rc = pci_drv->probe(pci_dev, ddi->id);
if (!rc)
return rc;
if (rc < 0) {
/* Probe failed: undo the binding and the PM usage count. */
pci_dev->driver = NULL;
pm_runtime_put_sync(dev);
return rc;
}
/*
* Probe function should return < 0 for failure, 0 for success
* Treat values > 0 as success, but warn.
*/
dev_warn(dev, "Driver probe function unexpectedly returned %d\n", rc);
return 0;
}

在完成binding后,还需要调用pci_enable_device(dev)启用设备,之所以需要这一步是因为初始状态下设备是禁用的,需要设置Configuration Space中的Command Register启用。真正的工作是在pci_enable_device_flags中完成的,它首先递归地启用设备所属的Bus,然后启用设备本身:

1
2
3
4
5
pci_enable_device
--> pci_enable_device_flags
--> pci_enable_bridge
--> pci_enable_device --> ... --> pci_enable_bridge --> pci_enable_device
--> do_pci_enable_device

真正启用Device的代码位于do_pci_enable_device,它最终会调用到pci_enable_resources,写入Command Register启用设备:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
/* do_pci_enable_device
--> pcibios_enable_device
--> pci_enable_resources
*/
int pci_enable_resources(struct pci_dev *dev, int mask)
{
u16 cmd, old_cmd;
int i;
struct resource *r;

pci_read_config_word(dev, PCI_COMMAND, &cmd);
old_cmd = cmd;

for (i = 0; i < PCI_NUM_RESOURCES; i++) {
if (!(mask & (1 << i)))
continue;

r = &dev->resource[i];

if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
continue;
if ((i == PCI_ROM_RESOURCE) &&
(!(r->flags & IORESOURCE_ROM_ENABLE)))
continue;

if (r->flags & IORESOURCE_UNSET) {
pci_err(dev, "can't enable device: BAR %d %pR not assigned\n",
i, r);
return -EINVAL;
}

if (!r->parent) {
pci_err(dev, "can't enable device: BAR %d %pR not claimed\n",
i, r);
return -EINVAL;
}

if (r->flags & IORESOURCE_IO)
cmd |= PCI_COMMAND_IO;
if (r->flags & IORESOURCE_MEM)
cmd |= PCI_COMMAND_MEMORY;
}

if (cmd != old_cmd) {
pci_info(dev, "enabling device (%04x -> %04x)\n", old_cmd, cmd);
pci_write_config_word(dev, PCI_COMMAND, cmd);
}
return 0;
}