/*
 * Kernel: 2.6.10-rc2
 * Finished: 01/01/05
 */
/*
* Activate the first processor.
*/
asmlinkage void __init start_kernel(void)
{
/*
 * Boot-time entry point shown here: runs the initialisation steps below
 * in a fixed order with interrupts initially disabled, enables
 * interrupts part-way through, and finally hands over to rest_init().
 * The one-line notes on each call are translated from the original
 * annotations; they describe callees whose bodies are mostly not
 * visible in this file, so treat them as reviewer notes, not contracts.
 */
char * command_line;
extern struct kernel_param __start___param[], __stop___param[];
/*
* Interrupts are still disabled. Do necessary setups, then
* enable them
*/
lock_kernel(); /* take the big kernel lock */
page_address_init(); /* per original annotation: only does work when highmem is configured */
printk(linux_banner); /* print the kernel version banner */
setup_arch(&command_line); /* architecture-specific setup (memory map, paging, ACPI, ...) */
setup_per_cpu_areas(); /* per original annotation: set up the per-CPU area offsets for SMP */
/*
* Mark the boot cpu "online" so that it can call console drivers in
* printk() and can access its per-cpu storage.
*/
smp_prepare_boot_cpu();
/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init(); /* runqueue setup */
build_all_zonelists(); /* per original annotation: build the allocation policy (zonelists) */
page_alloc_init(); /* per original annotation: CPU hotplug setup */
printk("Kernel command line: %s\n", saved_command_line);
parse_early_param();
parse_args("Booting kernel", command_line, __start___param,
__stop___param - __start___param,
&unknown_bootoption); /* parse the kernel parameters that were passed in and apply them */
sort_main_extable(); /* per original annotation: sort the exception-handler table */
trap_init(); /* install the trap/exception gates */
rcu_init(); /* per original annotation: init RCU (Read-Copy Update), mainly a per_cpu_rcu_tasklet */
init_IRQ(); /* install the external interrupt gates; individual handlers are registered later via request_irq() */
pidhash_init(); /* pid hash table init — presumably one table per pid type; TODO confirm */
init_timers(); /* per original annotation: init per_cpu_tvec_bases and set up TIMER_SOFTIRQ */
softirq_init(); /* per original annotation: init softirqs and tasklets */
time_init(); /* per original annotation: hardware clock and its interrupt */
/*
* HACK ALERT! This is early. We're enabling the console before
* we've done PCI setups etc, and console_init() must be aware of
* this. But we do want output early, in case something goes wrong.
*/
console_init();
/* A panic recorded earlier is raised only now that the console is up. */
if (panic_later)
panic(panic_later, panic_param);
profile_init(); /* profiling setup */
local_irq_enable(); /* enable interrupts */
#ifdef CONFIG_BLK_DEV_INITRD
/* Disable the initrd if it starts below the first usable low page frame. */
if (initrd_start && !initrd_below_start_ok &&
initrd_start < min_low_pfn << PAGE_SHIFT) {
printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
"disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT);
initrd_start = 0;
}
#endif
vfs_caches_init_early(); /* per original annotation: init hash tables for the dentry and inode caches */
mem_init(); /* final memory init: hands the bootmem pages over to the free lists */
kmem_cache_init(); /* slab allocator init */
numa_policy_init(); /* NOTE(review): not examined in the original annotation */
if (late_time_init)
late_time_init();
calibrate_delay(); /* per original annotation: compute BogoMIPS */
pidmap_init(); /* initialise the pid bitmap */
pgtable_cache_init(); /* per original annotation: pgd/pmd slab init */
prio_tree_init(); /* per original annotation: init index_bits_to_maxindex, for (struct page)->mapping->i_map */
anon_vma_init(); /* per original annotation: anon_vma slab init, used for rmap support */
#ifdef CONFIG_X86
if (efi_enabled)
efi_enter_virtual_mode();
#endif
fork_init(num_physpages); /* per original annotation: compute the maximum safe number of tasks and set the current task's limit */
proc_caches_init(); /* per original annotation: remaining slab caches */
buffer_init(); /* buffer head init */
unnamed_dev_init(); /* NOTE(review): idr-related; not examined in the original annotation */
security_init(); /* security framework init */
vfs_caches_init(num_physpages); /* caches needed by the VFS */
radix_tree_init(); /* per original annotation: radix tree init, mainly to speed up lookup of dirty or writeback pages */
signals_init(); /* per original annotation: create the sigqueue slab */
/* rootfs populating might need page-writeback */
page_writeback_init(); /* per original annotation: compute VM ratios and decide whether writeback is needed */
#ifdef CONFIG_PROC_FS
proc_root_init(); /* per original annotation: proc filesystem init; creates directories and files according to config */
#endif
check_bugs();
acpi_early_init(); /* before LAPIC and SMP init */
/* Do the rest non-__init'ed, we're now alive */
rest_init(); /* spawns the init kernel thread, then idles */
}
/* arch/i386/kernel/setup.c */
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
* for initialization. Note, the efi init code path is determined by the
* global efi_enabled. This allows the same kernel image to be used on existing
* systems (with a traditional BIOS) as well as on EFI systems.
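* (Annotation: detects whether the kernel was booted through EFI. If so,
* the memmap, systab, etc. passed in by EFI are used for initialization.
* Note: the EFI init path is selected by the global efi_enabled — is
* that itself controlled by a config option?)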
*/
void __init setup_arch(char **cmdline_p)
{
/*
 * i386 architecture setup. In order: copies the boot-time data into
 * kernel variables, obtains the memory map (EFI or BIOS), parses the
 * early command-line options, initialises the bootmem allocator
 * (setup_memory) and the kernel page tables (paging_init), then probes
 * ACPI/SMP configuration and selects the console switch.
 *
 * cmdline_p: out parameter; passed to parse_cmdline_early(), which
 *            stores the filtered command line pointer in it.
 */
unsigned long max_low_pfn;
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
pre_setup_arch_hook(); /* architecture hook; per original annotation empty on i386 */
early_cpu_init(); /* register CPU vendors and detect the boot CPU */
/*
* FIXME: This isn't an official loader_type right
* now but does currently work with elilo.
* If we were configured as an EFI kernel, check to make
* sure that we were loaded correctly from elilo and that
* the system table is valid. If not, then initialize normally.
*/
#ifdef CONFIG_EFI
if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
efi_enabled = 1;
#endif
/*
 * Copy the information collected by the setup code (BIOS probe results)
 * into kernel variables; per the original annotation it was previously
 * held in a temporary page.
 */
ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
drive_info = DRIVE_INFO;
screen_info = SCREEN_INFO;
edid_info = EDID_INFO;
apm_info.bios = APM_BIOS_INFO;
ist_info = IST_INFO;
saved_videomode = VIDEO_MODE;
if( SYS_DESC_TABLE.length != 0 ) {
MCA_bus = SYS_DESC_TABLE.table[3] &0x2;
machine_id = SYS_DESC_TABLE.table[0];
machine_submodel_id = SYS_DESC_TABLE.table[1];
BIOS_revision = SYS_DESC_TABLE.table[2];
}
aux_device_present = AUX_DEVICE_INFO;
#ifdef CONFIG_BLK_DEV_RAM
rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
#endif
ARCH_SETUP /* per original annotation: does nothing on x86 */
if (efi_enabled)
efi_init();
else {
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
print_memory_map(machine_specific_memory_setup()); /* build the memory map (kept in e820), then print it */
}
copy_edd(); /* per original annotation: copy Enhanced Disk Drive data from the setup probe (CONFIG_EDD, experimental) */
if (!MOUNT_ROOT_RDONLY)
root_mountflags &= ~MS_RDONLY;
/* Record the kernel image layout in init_mm and in the resource descriptors. */
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
code_resource.start = virt_to_phys(_text);
code_resource.end = virt_to_phys(_etext)-1;
data_resource.start = virt_to_phys(_etext);
data_resource.end = virt_to_phys(_edata)-1;
parse_cmdline_early(cmdline_p); /* handle early boot options given by the user (e.g. mem=, memmap=, acpi=, ...) */
max_low_pfn = setup_memory(); /* initialise the bootmem allocator and reserve regions; returns max_low_pfn */
/*
* NOTE: before this point _nobody_ is allowed to allocate
* any memory using the bootmem allocator. Although the
* alloctor is now initialised only the first 8Mb of the kernel
* virtual address space has been mapped. All allocations before
* paging_init() has completed must use the alloc_bootmem_low_pages()
* variant (which allocates DMA'able memory) and care must be taken
* not to exceed the 8Mb limit.
*/
#ifdef CONFIG_SMP
smp_alloc_memory(); /* AP processor realmode stacks in low memory (memory for starting the other CPUs) */
#endif
paging_init(); /* set up the kernel page tables and zone sizes */
/*
* NOTE: at this point the bootmem allocator is fully available.
*/
#ifdef CONFIG_EARLY_PRINTK
{
/* Enable the early console if "earlyprintk=" appears on the command line. */
char *s = strstr(*cmdline_p, "earlyprintk=");
if (s) {
extern void setup_early_printk(char *);
setup_early_printk(s);
printk("early console enabled\n");
}
}
#endif
dmi_scan_machine(); /* DMI=Desktop Management Interface */
#ifdef CONFIG_X86_GENERICARCH
generic_apic_probe(*cmdline_p); /* probe for the APIC (Advanced Programmable Interrupt Controller) */
#endif
if (efi_enabled)
efi_map_memmap();
/*
* Parse the ACPI tables for possible boot-time SMP configuration.
*/
acpi_boot_init();
#ifdef CONFIG_X86_LOCAL_APIC
if (smp_found_config)
get_smp_config();
#endif
register_memory(max_low_pfn); /* per original annotation: build the resource tree for system I/O resources */
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
/* Use the VGA console unless EFI reports 0xa0000 as conventional memory. */
if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
conswitchp = &dummy_con;
#endif
#endif
}
/* arch/i386/kernel/cpu/common.c */
void __init early_cpu_init(void)
{
/*
 * Calls the init routine of each of the nine supported x86 CPU vendors,
 * then runs early detection of the boot CPU. Per the original
 * annotation each vendor routine fills in its cpu_devs entry.
 */
intel_cpu_init(); /* per original annotation: fills in the Intel CPU descriptor */
cyrix_init_cpu();
nsc_init_cpu();
amd_init_cpu();
centaur_init_cpu();
transmeta_init_cpu();
rise_init_cpu();
nexgen_init_cpu();
umc_init_cpu();
early_cpu_detect(); /* per original annotation: detect the CPU and store the result in boot_cpu_data */
#ifdef CONFIG_DEBUG_PAGEALLOC
/* pse is not compatible with on-the-fly unmapping,
* disable it even if the cpus claim to support it.
*/
clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
disable_pse = 1;
#endif
}
/* arch/i386/kernel/setup.c */
/*
 * Print every entry of the global e820 memory map: its address range
 * followed by a human-readable region type.
 *
 * who: label printed at the start of each line to identify the source
 *      of the map (the callers in this file pass the string returned by
 *      machine_specific_memory_setup() or "user").
 */
static void __init print_memory_map(char *who)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		/* Each field must be read from entry i of the map array. */
		printk(" %s: %016Lx - %016Lx ", who,
			e820.map[i].addr,
			e820.map[i].addr + e820.map[i].size);
		switch (e820.map[i].type) {
		case E820_RAM:
			printk("(usable)\n");
			break;
		case E820_RESERVED:
			printk("(reserved)\n");
			break;
		case E820_ACPI:
			printk("(ACPI data)\n");
			break;
		case E820_NVS:
			printk("(ACPI NVS)\n");
			break;
		default:
			/* Unknown type: print the raw value. */
			printk("type %lu\n", e820.map[i].type);
			break;
		}
	}
}
/* arch/i386/kernel/setup.c */
static void __init parse_cmdline_early (char ** cmdline_p)
{
/*
 * Scan saved_command_line one character at a time, acting on the early
 * options recognised below, and copy the characters read into
 * command_line. Text consumed by the mem=/memmap=/highmem=/vmalloc=
 * handlers (which advance 'from') is not copied.
 *
 * cmdline_p: out parameter; set to command_line on return.
 *
 * 'c' holds the previously read character (initially ' '), so the
 * checks on c == ' ' only match an option at the start of a word.
 * 'userdef' records that the user changed the memory map, in which case
 * the resulting map is printed at the end.
 */
char c = ' ', *to = command_line, *from = saved_command_line;
int len = 0;
int userdef = 0;
/* Save unparsed command line copy for /proc/cmdline */
saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
for (;;) {
/*
* "mem=nopentium" disables the 4MB page tables.
* "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
* to <mem>, overriding the bios size.
* "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
* <start> to <start>+<mem>, overriding the bios size.
*
* HPA tells me bootloaders need to parse mem=, so no new
* option should be mem= [also see Documentation/i386/boot.txt]
*/
if (c == ' ' && !memcmp(from, "mem=", 4)) {
/* Drop the separating space that was already copied to the output. */
if (to != command_line)
to--;
if (!memcmp(from+4, "nopentium", 9)) {
from += 9+4;
clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
disable_pse = 1;
} else {
/* If the user specifies memory size, we
* limit the BIOS-provided memory map to
* that size. exactmap can be used to specify
* the exact map. mem=number can be used to
* trim the existing memory map.
*/
unsigned long long mem_size;
mem_size = memparse(from+4, &from);
limit_regions(mem_size);
userdef=1;
}
}
if (c == ' ' && !memcmp(from, "memmap=", 7)) {
/* Drop the separating space that was already copied to the output. */
if (to != command_line)
to--;
if (!memcmp(from+7, "exactmap", 8)) {
/* Discard the existing map; entries are expected from later memmap= options. */
from += 8+7;
e820.nr_map = 0;
userdef = 1;
} else {
/* If the user specifies memory size, we
* limit the BIOS-provided memory map to
* that size. exactmap can be used to specify
* the exact map. mem=number can be used to
* trim the existing memory map.
*/
unsigned long long start_at, mem_size;
mem_size = memparse(from+7, &from);
/* The separator after the size selects the region type: '@' RAM, '#' ACPI, '$' reserved. */
if (*from == '@') {
start_at = memparse(from+1, &from);
add_memory_region(start_at, mem_size, E820_RAM);
} else if (*from == '#') {
start_at = memparse(from+1, &from);
add_memory_region(start_at, mem_size, E820_ACPI);
} else if (*from == '$') {
start_at = memparse(from+1, &from);
add_memory_region(start_at, mem_size, E820_RESERVED);
} else {
limit_regions(mem_size);
userdef=1;
}
}
}
/* NOTE(review): the else-if options below do not test c == ' ', unlike the options above. */
#ifdef CONFIG_X86_SMP
/*
* If the BIOS enumerates physical processors before logical,
* maxcpus=N at enumeration-time can be used to disable HT.
*/
else if (!memcmp(from, "maxcpus=", 8)) {
extern unsigned int maxcpus;
maxcpus = simple_strtoul(from + 8, NULL, 0);
}
#endif
#ifdef CONFIG_ACPI_BOOT
/* "acpi=off" disables both ACPI table parsing and interpreter */
else if (!memcmp(from, "acpi=off", 8)) {
disable_acpi();
}
/* acpi=force to over-ride black-list */
else if (!memcmp(from, "acpi=force", 10)) {
acpi_force = 1;
acpi_ht = 1;
acpi_disabled = 0;
}
/* acpi=strict disables out-of-spec workarounds */
else if (!memcmp(from, "acpi=strict", 11)) {
acpi_strict = 1;
}
/* Limit ACPI just to boot-time to enable HT */
else if (!memcmp(from, "acpi=ht", 7)) {
if (!acpi_force)
disable_acpi();
acpi_ht = 1;
}
/* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
else if (!memcmp(from, "pci=noacpi", 10)) {
acpi_disable_pci();
}
/* "acpi=noirq" disables ACPI interrupt routing */
else if (!memcmp(from, "acpi=noirq", 10)) {
acpi_noirq_set();
}
else if (!memcmp(from, "acpi_sci=edge", 13))
acpi_sci_flags.trigger = 1;
else if (!memcmp(from, "acpi_sci=level", 14))
acpi_sci_flags.trigger = 3;
else if (!memcmp(from, "acpi_sci=high", 13))
acpi_sci_flags.polarity = 1;
else if (!memcmp(from, "acpi_sci=low", 12))
acpi_sci_flags.polarity = 3;
#ifdef CONFIG_X86_IO_APIC
else if (!memcmp(from, "acpi_skip_timer_override", 24))
acpi_skip_timer_override = 1;
#endif
#ifdef CONFIG_X86_LOCAL_APIC
/* disable IO-APIC */
else if (!memcmp(from, "noapic", 6))
disable_ioapic_setup();
#endif /* CONFIG_X86_LOCAL_APIC */
#endif /* CONFIG_ACPI_BOOT */
/*
* highmem=size forces highmem to be exactly 'size' bytes.
* This works even on boxes that have no highmem otherwise.
* This also works to reduce highmem size on bigger boxes.
*/
if (c == ' ' && !memcmp(from, "highmem=", 8))
highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
/*
* vmalloc=size forces the vmalloc area to be exactly 'size'
* bytes. This can be used to increase (or decrease) the
* vmalloc area - the default is 128m.
*/
if (c == ' ' && !memcmp(from, "vmalloc=", 8))
__VMALLOC_RESERVE = memparse(from+8, &from);
/* Read the next character; stop at the terminator or once COMMAND_LINE_SIZE is reached. */
c = *(from++);
if (!c)
break;
if (COMMAND_LINE_SIZE <= ++len)
break;
*(to++) = c;
}
*to = '\0';
*cmdline_p = command_line;
if (userdef) {
printk(KERN_INFO "user-defined physical RAM map:\n");
print_memory_map("user");
}
}
static unsigned long __init setup_memory(void)
{
/*
 * Initialise the boot-time (bootmem) allocator over low memory and
 * reserve the regions that must not be handed out: the range from
 * HIGH_MEMORY up to the end of the bootmem bitmap, physical page 0,
 * the EBDA, and the configuration-dependent regions below.
 *
 * Returns max_low_pfn as computed by find_max_low_pfn().
 */
unsigned long bootmap_size, start_pfn, max_low_pfn;
/*
* partially used pages are not usable - thus
* we are rounding upwards:
*/
start_pfn = PFN_UP(init_pg_tables_end);
find_max_pfn();
max_low_pfn = find_max_low_pfn();
#ifdef CONFIG_HIGHMEM
/* Highmem spans [max_low_pfn, max_pfn) when present; otherwise the range is empty. */
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn) {
highstart_pfn = max_low_pfn;
}
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
/*
* Initialize the boot-time allocator (with low memory only):
*/
bootmap_size = init_bootmem(start_pfn, max_low_pfn); /* per original annotation: marks the pages in this range reserved; result seemingly kept in node_data[0]->bdata — TODO confirm */
register_bootmem_low_pages(max_low_pfn); /* per original annotation: records all usable low-memory pages in the bitmap */
/*
* Reserve the bootmem bitmap itself as well. We do this in two
* steps (first step was init_bootmem()) because this catches
* the (very unlikely) case of us accidentally initializing the
* bootmem allocator with an invalid RAM area.
*/
reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); /* per original annotation: this range also keeps the in-memory kernel image reserved */
/*
* reserve physical page 0 - it's a special BIOS page on many boxes,
* enabling clean reboots, SMP operation, laptop functions.
*/
reserve_bootmem(0, PAGE_SIZE); /* reserve physical page 0 (boot-related and BIOS data, per original annotation) */
/* reserve EBDA region, it's a 4K region */
reserve_ebda_region();
/* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
PCI prefetch into it (errata #56). Usually the page is reserved anyways,
unless you have no PS/2 mouse plugged in. */
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
boot_cpu_data.x86 == 6)
reserve_bootmem(0xa0000 - 4096, 4096);
#ifdef CONFIG_SMP
/*
* But first pinch a few for the stack/trampoline stuff
* FIXME: Don't need the extra page at 4K, but need to fix
* trampoline before removing it. (see the GDT stuff)
*/
reserve_bootmem(PAGE_SIZE, PAGE_SIZE); /* needed on SMP systems */
#endif
#ifdef CONFIG_ACPI_SLEEP
/*
* Reserve low memory region for sleep support.
*/
acpi_reserve_bootmem();
#endif
#ifdef CONFIG_X86_FIND_SMP_CONFIG
/*
* Find and reserve possible boot-time SMP configuration:
*/
find_smp_config();
#endif
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve the initrd if it lies entirely within low memory; otherwise disable it. */
if (LOADER_TYPE && INITRD_START) {
if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
reserve_bootmem(INITRD_START, INITRD_SIZE);
initrd_start =
INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
initrd_end = initrd_start+INITRD_SIZE;
}
else {
printk(KERN_ERR "initrd extends beyond end of memory "
"(0x%08lx > 0x%08lx)\ndisabling initrd\n",
INITRD_START + INITRD_SIZE,
max_low_pfn << PAGE_SHIFT);
initrd_start = 0;
}
}
#endif
return max_low_pfn;
}
/* arch/i386/mm/init.c */
/*
* paging_init() sets up the page tables - note that the first 8MB are
* already mapped by head.S.
* (Annotation: 8MB of memory has already been mapped in head.S.)
* This routines also unmaps the page at virtual kernel address 0, so
* that we can trap those pesky NULL-reference errors in the kernel.
*/
void __init paging_init(void)
{
/*
 * Build the kernel page tables via pagetable_init(), load
 * swapper_pg_dir into CR3, flush the TLB, then run kmap_init() and
 * zone_sizes_init().
 */
#ifdef CONFIG_X86_PAE
set_nx();
if (nx_enabled)
printk("NX (Execute Disable) protection: active\n");
#endif
pagetable_init(); /* fill in the kernel-space page tables; per original annotation they were set up but empty before this */
load_cr3(swapper_pg_dir);
#ifdef CONFIG_X86_PAE
/*
* We will bail out later - printk doesn't work right now so
* the user would just see a hanging kernel.
*/
if (cpu_has_pae)
set_in_cr4(X86_CR4_PAE);
#endif
__flush_tlb_all(); /* flush the TLB */
kmap_init(); /* per original annotation: set up the memory used for highmem mappings */
zone_sizes_init(); /* per original annotation: memory zone init (pgdat_list->zone) */
}
static void __init pagetable_init (void)
{
/*
 * Populate swapper_pg_dir: enable PSE/PGE in CR4 when the CPU has
 * them, map low physical memory into kernel space, and create the
 * page-table structure for the fixmap and permanent-kmap ranges.
 */
unsigned long vaddr;
pgd_t *pgd_base = swapper_pg_dir;
#ifdef CONFIG_X86_PAE /* three-level page tables (Physical Address Extension) */
int i;
/* Init entries of the first-level page table to the zero page */
for (i = 0; i < PTRS_PER_PGD; i++) /* per original annotation: PTRS_PER_PGD=4 */
set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
#endif
/* Enable PSE if available (Page Size Extensions; 4MB pages per original annotation) */
if (cpu_has_pse) {
set_in_cr4(X86_CR4_PSE);
}
/* Enable PGE if available (PTE Global Bit)*/
if (cpu_has_pge) {
set_in_cr4(X86_CR4_PGE);
__PAGE_KERNEL |= _PAGE_GLOBAL;
__PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
}
kernel_physical_mapping_init(pgd_base); /* map physical pages 0..max_low_pfn starting at PAGE_OFFSET */
remap_numa_kva(); /* NOTE(review): per original annotation, re-initialises the NUMA kernel virtual address space — not confirmed */
/*
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
*/
vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; /* per original annotation: lies in the high address region */
page_table_range_init(vaddr, 0, pgd_base); /* per original annotation: page tables for the fixed addresses (ACPI addresses etc.) */
permanent_kmaps_init(pgd_base); /* pkmap setup — presumably used for highmem mappings; TODO confirm */
#ifdef CONFIG_X86_PAE
/*
* Add low memory identity-mappings - SMP needs it when
* starting up on an AP from real-mode. In the non-PAE
* case we already have these mappings through head.S.
* All user-space mappings are explicitly cleared after
* SMP startup.
*/
pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
#endif
}
/*
* This maps the physical memory to kernel virtual address space, a total
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET. (Annotation: maps physical memory into kernel virtual
* address space, max_low_pfn pages in total, starting at 0xc0000000.)
*/
/*
 * Map page frames 0..max_low_pfn into the page directory at pgd_base,
 * starting from the directory slot that covers PAGE_OFFSET.
 *
 * pgd_base: base of the page directory to fill in.
 *
 * With PSE each PMD entry maps a large page (PTRS_PER_PTE frames at a
 * time); otherwise a page table is created per PMD entry and filled one
 * PTE at a time. Pages for which is_kernel_text() is true are mapped
 * with the *_EXEC protection, all others without it.
 */
static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
{
	unsigned long pfn;
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	int pgd_idx, pmd_idx, pte_ofs;

	pgd_idx = pgd_index(PAGE_OFFSET);	/* mapping starts at kernel space */
	pgd = pgd_base + pgd_idx;
	pfn = 0;

	for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
		pmd = one_md_table_init(pgd);	/* set up the second-level table */
		if (pfn >= max_low_pfn)
			continue;
		for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
			unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;

			/* Map with big pages if possible, otherwise create normal page tables. */
			if (cpu_has_pse) {
				/* Last byte covered by this large page. */
				unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;

				/* Executable if either end of the large page is kernel text. */
				if (is_kernel_text(address) || is_kernel_text(address2))
					set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
				else
					set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
				pfn += PTRS_PER_PTE;
			} else {
				pte = one_page_table_init(pmd);

				/*
				 * Advance 'address' together with pfn so that the
				 * kernel-text test is made for the page being mapped,
				 * not for the first page of the table.
				 */
				for (pte_ofs = 0;
				     pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
				     pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
					if (is_kernel_text(address))
						set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
					else
						set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
				}
			}
		}
	}
}
/*
 * Put the runqueue of every CPU slot (0..NR_CPUS-1) into its initial
 * empty state, then register the currently running task as the idle
 * thread of this CPU.
 */
void __init sched_init(void)
{
	runqueue_t *rq;
	int cpu, idx, prio;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		rq = cpu_rq(cpu);
		spin_lock_init(&rq->lock);

		/* arrays[0] starts as the active array, arrays[1] as the expired one. */
		rq->active = &rq->arrays[0];
		rq->expired = &rq->arrays[1];
		rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
		rq->sd = &sched_domain_dummy;
		rq->cpu_load = 0;
		rq->active_balance = 0;
		rq->push_cpu = 0;
		rq->migration_thread = NULL;
		INIT_LIST_HEAD(&rq->migration_queue);
#endif
		atomic_set(&rq->nr_iowait, 0);

		/* Empty every priority list and clear its bit in both arrays. */
		for (idx = 0; idx < 2; idx++) {
			prio_array_t *array = &rq->arrays[idx];

			for (prio = 0; prio < MAX_PRIO; prio++) {
				INIT_LIST_HEAD(&array->queue[prio]);
				__clear_bit(prio, array->bitmap);
			}
			/* delimiter for bitsearch */
			__set_bit(MAX_PRIO, array->bitmap);
		}
	}

	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current);

	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
	init_idle(current, smp_processor_id());
}
/*
 * Install the gates for exception vectors 0-19 and for the system-call
 * vector, then call cpu_init() and the architecture hook.
 * (Original annotation: the vectors are being re-set here; at early
 * init they all pointed at ignore_int.)
 */
void __init trap_init(void)
{
#ifdef CONFIG_EISA
	/* Look for the "EISA" signature at 0x0FFFD9. */
	if (isa_readl(0x0FFFD9) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
		EISA_bus = 1;
	}
#endif
#ifdef CONFIG_X86_LOCAL_APIC
	init_apic_mappings();
#endif
	set_trap_gate(0,&divide_error);		/* trap gate */
	set_intr_gate(1,&debug);		/* interrupt gate */
	set_intr_gate(2,&nmi);
	set_system_intr_gate(3, &int3);		/* int3-5 can be called from all */
	set_system_gate(4,&overflow);
	set_system_gate(5,&bounds);
	set_trap_gate(6,&invalid_op);
	set_trap_gate(7,&device_not_available);
	set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
	set_trap_gate(9,&coprocessor_segment_overrun);
	set_trap_gate(10,&invalid_TSS);
	set_trap_gate(11,&segment_not_present);
	set_trap_gate(12,&stack_segment);
	set_trap_gate(13,&general_protection);
	set_intr_gate(14,&page_fault);
	set_trap_gate(15,&spurious_interrupt_bug);
	set_trap_gate(16,&coprocessor_error);
	set_trap_gate(17,&alignment_check);
#ifdef CONFIG_X86_MCE
	set_trap_gate(18,&machine_check);
#endif
	set_trap_gate(19,&simd_coprocessor_error);
	set_system_gate(SYSCALL_VECTOR,&system_call);	/* system-call entry */
	/*
	 * Should be a barrier for any external CPU state.
	 */
	cpu_init();		/* per original annotation: reloads the GDT and LDT */
	trap_init_hook();	/* per original annotation: does nothing on i386 */
}
/*
 * Install an interrupt gate for every external vector (except the
 * system-call vector), run the architecture hooks, program the PIT and,
 * when an external FPU is detected, hook up its IRQ.
 */
void __init init_IRQ(void)
{
	int i;

	/* all the set up before the call gates are initialised */
	pre_intr_init_hook();
	/*
	 * Cover the whole vector space, no vector can escape
	 * us. (some of these will be overridden and become
	 * 'special' SMP interrupts)
	 */
	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
		int vector = FIRST_EXTERNAL_VECTOR + i;

		if (i >= NR_IRQS)
			break;
		/* Each vector gets its own entry stub: interrupt[i], not the table itself. */
		if (vector != SYSCALL_VECTOR)
			set_intr_gate(vector, interrupt[i]);
	}
	/* setup after call gates are initialised (usually add in
	 * the architecture specific gates)
	 */
	intr_init_hook();
	/*
	 * Set the clock to HZ Hz, we already have a valid
	 * vector now:
	 */
	setup_pit_timer();
	/*
	 * External FPU? Set up irq13 if so, for
	 * original braindamaged IBM FERR coupling.
	 */
	if (boot_cpu_data.hard_math && !cpu_has_fpu)
		setup_irq(FPU_IRQ, &fpu_irq);
	irq_ctx_init(smp_processor_id());
}
void __init mem_init(void)
{
/*
 * Final memory initialisation: hands the bootmem pages to the page
 * allocator free lists, counts the reserved RAM pages in low memory,
 * sets up highmem pages, registers the kcore ranges and prints the
 * memory summary line.
 */
extern int ppro_with_ram_bug(void); /* presumably detects a Pentium Pro with the known RAM bug — TODO confirm */
int codesize, reservedpages, datasize, initsize;
int tmp;
int bad_ppro;
#ifndef CONFIG_DISCONTIGMEM
if (!mem_map)
BUG();
#endif
bad_ppro = ppro_with_ram_bug();
#ifdef CONFIG_HIGHMEM
/* check that fixmap and pkmap do not overlap */
if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
BUG();
}
#endif
set_max_mapnr_init(); /* per original annotation: sets up the highmem region — TODO confirm */
#ifdef CONFIG_HIGHMEM
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE);
#else
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
#endif
/* this will put all low memory onto the freelists (per original annotation: every page the bootmem bitmap marks as available) */
totalram_pages += __free_all_bootmem();
reservedpages = 0;
for (tmp = 0; tmp < max_low_pfn; tmp++)
/*
* Only count reserved RAM pages
*/
if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
reservedpages++;
set_highmem_pages_init(bad_ppro);
/* Section sizes in bytes, derived from the linker-provided symbols. */
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); /* register low memory in the kcore list (presumably physical RAM — TODO confirm) */
kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
VMALLOC_END-VMALLOC_START); /* register the vmalloc range */
printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
num_physpages << (PAGE_SHIFT-10),
codesize >> 10,
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
initsize >> 10,
(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
);
#ifdef CONFIG_X86_PAE
if (!cpu_has_pae)
panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
#endif
/* A negative wp_works_ok means "not yet determined": run the test now. */
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
/*
* Subtle. SMP is doing it's boot stuff late (because it has to
* fork idle threads) - but it also needs low mappings for the
* protected-mode entry to work. We zap these entries only after
* the WP-bit has been tested.
*/
#ifndef CONFIG_SMP
zap_low_mappings();
#endif
}
/* Initialisation.
* Called after the gfp() functions have been enabled, and before smp_init().
*/
void __init kmem_cache_init(void)
{
/*
 * Bootstrap the slab allocator following the five steps described in
 * the comment below: set up the statically allocated cache_cache,
 * create the kmalloc caches, replace the bootstrap head arrays with
 * kmalloc'ed ones, and resize the head arrays of every cache.
 */
size_t left_over;
struct cache_sizes *sizes;
struct cache_names *names;
/*
* Fragmentation resistance on low memory - only use bigger
* page orders on machines with more than 32MB of memory.
*/
if (num_physpages > (32 << 20) >> PAGE_SHIFT) /* more than 32MB of memory */
slab_break_gfp_order = BREAK_GFP_ORDER_HI;
/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
* 1) initialize the cache_cache cache: it contains the kmem_cache_t
* structures of all caches, except cache_cache itself: cache_cache
* is statically allocated.
* Initially an __init data area is used for the head array, it's
* replaced with a kmalloc allocated array at the end of the bootstrap.
* 2) Create the first kmalloc cache.
* The kmem_cache_t for the new cache is allocated normally. An __init
* data area is used for the head array.
* 3) Create the remaining kmalloc caches, with minimally sized head arrays.
* 4) Replace the __init data head arrays for cache_cache and the first
* kmalloc cache with kmalloc allocated arrays.
* 5) Resize the head arrays of the kmalloc caches to their final sizes.
*/
/* 1) create the cache_cache */
init_MUTEX(&cache_chain_sem); /* initialise the cache-chain semaphore */
INIT_LIST_HEAD(&cache_chain); /* initialise the cache-chain list */
list_add(&cache_cache.next, &cache_chain); /* put cache_cache itself on the chain */
cache_cache.colour_off = cache_line_size(); /* original annotation notes 128 here — presumably CPU-dependent */
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
&left_over, &cache_cache.num);
if (!cache_cache.num)
BUG();
cache_cache.colour = left_over/cache_cache.colour_off;
cache_cache.colour_next = 0;
cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
sizeof(struct slab), cache_line_size());
/* 2+3) create the kmalloc caches */
sizes = malloc_sizes;
names = cache_names;
/* The malloc_sizes table is terminated by an entry whose cs_size is 0. */
while (sizes->cs_size) {
/* For performance, all the general caches are L1 aligned.
* This should be particularly beneficial on SMP boxes, as it
* eliminates "false sharing".
* Note for systems short on memory removing the alignment will
* allow tighter packing of the smaller caches. */
sizes->cs_cachep = kmem_cache_create(names->name,
sizes->cs_size, ARCH_KMALLOC_MINALIGN,
(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
/* Inc off-slab bufctl limit until the ceiling is hit. */
if (!(OFF_SLAB(sizes->cs_cachep))) {
offslab_limit = sizes->cs_size-sizeof(struct slab);
offslab_limit /= sizeof(kmem_bufctl_t);
}
sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
sizes->cs_size, ARCH_KMALLOC_MINALIGN,
(ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
NULL, NULL);
sizes++;
names++;
}
/* 4) Replace the bootstrap head arrays */
{
void * ptr;
/* NOTE(review): the kmalloc() results below are used without a NULL check. */
ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
/* Copy and swap the head array with interrupts off. */
local_irq_disable();
BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
cache_cache.array[smp_processor_id()] = ptr;
local_irq_enable();
ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
local_irq_disable();
BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
sizeof(struct arraycache_init));
malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
local_irq_enable();
}
/* 5) resize the head arrays to their final sizes */
{
kmem_cache_t *cachep;
down(&cache_chain_sem);
list_for_each_entry(cachep, &cache_chain, next)
enable_cpucache(cachep); /* enable the per-CPU cache for this cache */
up(&cache_chain_sem);
}
/* Done! */
g_cpucache_up = FULL;
/* Register a cpu startup notifier callback
* that initializes ac_data for all new cpus
*/
register_cpu_notifier(&cpucache_notifier);
/* The reap timers are started later, with a module init call:
* That part of the kernel is not yet operational.
*/
}
/*
 * Give the first pidmap entry a zero-filled page, mark PID 0 as used in
 * it, and attach the current task to PID 0 under every PID type.
 */
void __init pidmap_init(void)
{
	int type = 0;

	pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);

	/* PID 0 is taken: set its bit and reduce the free count by one. */
	set_bit(0, pidmap_array->page);
	atomic_dec(&pidmap_array->nr_free);

	/*
	 * Allocate PID 0, and hash it via all PID types
	 * (original annotation lists them as pid, pgid, tgid, sid):
	 */
	while (type < PIDTYPE_MAX) {
		attach_pid(current, type, 0);
		type++;
	}
}
/*
* We need to finalize in a non-__init function or else race conditions
* between the root thread and the init thread may cause start_kernel to
* be reaped by free_initmem before the root thread has proceeded to
* cpu_idle.
*
* gcc-3.4 accidentally inlines this function, so use noinline.
*/
static void noinline rest_init(void)
__releases(kernel_lock)
{
/*
 * Last step of boot on this thread: start the init kernel thread,
 * release the big kernel lock and enter the idle loop.
 */
kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND); /* start the init kernel thread */
numa_default_policy();
unlock_kernel();
cpu_idle();
}
static int init(void * unused)
{
/*
 * Body of the init kernel thread started by rest_init(): finishes
 * bring-up (SMP, rootfs, basic setup), frees the init memory, opens
 * /dev/console and then tries the candidate init programs in turn.
 * Panics if none of them can be run.
 *
 * unused: not referenced.
 */
lock_kernel();
/*
* Tell the world that we're going to be the grim
* reaper of innocent orphaned children.
*
* We don't want people to have to make incorrect
* assumptions about where in the task array this
* can be found.
*/
child_reaper = current;
/* Sets up cpus_possible() */
smp_prepare_cpus(max_cpus); /* per original annotation: the boot CPU starts each secondary CPU in turn; see smp_boot_cpus->do_boot_cpu() */
do_pre_smp_initcalls(); /* per original annotation: starts per-CPU threads such as migration_thread and ksoftirqd */
fixup_cpu_present_map();
smp_init(); /* per original annotation: mainly sets up the APIC — TODO confirm */
sched_init_smp();
/*
* Do this before initcalls, because some drivers want to access
* firmware files.
*/
populate_rootfs(); /* per original annotation: creates the initrd files */
do_basic_setup();
/*
* check if there is an early userspace init. If yes, let it do all
* the work
*/
if (sys_access((const char __user *) "/init", 0) == 0)
execute_command = "/init";
else
prepare_namespace(); /* per original annotation: loads the initrd, installs modules, mounts the root filesystem */
/*
* Ok, we have completed the initial bootup, and
* we're essentially up and running. Get rid of the
* initmem segments and start the user-mode stuff..
*/
free_initmem();
unlock_kernel();
system_state = SYSTEM_RUNNING;
numa_default_policy();
if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
printk("Warning: unable to open an initial console.\n");
/* Duplicate descriptor 0 twice (presumably providing descriptors 1 and 2). */
(void) sys_dup(0);
(void) sys_dup(0);
/*
* We try each of these until one succeeds.
*
* The Bourne shell can be used instead of init if we are
* trying to recover a really broken machine.
*/
if (execute_command)
run_init_process(execute_command);
run_init_process("/sbin/init");
run_init_process("/etc/init");
run_init_process("/bin/init");
run_init_process("/bin/sh");
panic("No init found. Try passing init= option to kernel.");
}