返回列表 发帖

[站长原创] 我的Linux学习笔记之内存管理

[站长原创] 我的Linux学习笔记之内存管理

Linux内存管理


osboy 原创文章


嵌入式开发联盟


mcuos.com@gmail.com


1. Linux页框管理
1.1. Arm的分页机制
Linux ARM采用4K字节作为一个页框大小(ARM硬件还支持1M字节的段映射)。
ARM MMU内存管理的两个主要作用:
(1)安全性:规定访问权限

(2)
提供地址空间:把不连续的空间转换成连续的。


页表变换过程:





内核空间:
swapper_pg_dir内核页目录表来寻址,包含0xc0000000—end_mem之间的内存映射,是实际的物理内存的大小。一般来说这个映射只需要一个偏移量即可,这里的偏移量为0xc0000000。因为我们把物理内存映射为0x00000000开始。
用户空间:
通过用户页目录指针得到页表,他们是动态建立的。
1.2. Linux页描述符
Linux的页描述符page管理物理内存,它把物理内存划分为4kbyte大小的页框,用页描述符记录这些页框的状态,并统一放到struct page *mem_map数组中

页框:
页:
页帧号:




arch/arm/mach-xxx/include/mach/memory.h中有,
#define PHYS_OFFSET
UL(0x00000000)


arch/arm/include/asm/memory.h中有:
#define PHYS_PFN_OFFSET
(PHYS_OFFSET >> PAGE_SHIFT)

#define PAGE_SHIFT
12

#define ARCH_PFN_OFFSET
PHYS_PFN_OFFSET


Include/asm-generic/memory-model.h中有:
#if defined(CONFIG_FLATMEM)
#define __pfn_to_page(pfn)
(mem_map + ((pfn) - ARCH_PFN_OFFSET))

#define __page_to_pfn(page)
((unsigned long)((page) - mem_map) + \



ARCH_PFN_OFFSET)


#define page_to_pfn __page_to_pfn
#define pfn_to_page __pfn_to_page

struct page *mem_map;

/*

* Each physical page in the system has a struct page associated with


* it to keep track of whatever it is we are using the page for at the


* moment. Note that we have no way to track which tasks are using


* a page, though if it is a pagecache page, rmap structures can tell us


* who is mapping it.


*/

struct page {

unsigned long flags;
/* Atomic flags, some possibly



* updated asynchronously */


atomic_t _count;
/* Usage count, see below. */


union {


atomic_t _mapcount;
/* Count of ptes mapped in mms,



* to show when page is mapped



* & limit reverse map searches.



*/


struct {
/* SLUB */


u16 inuse;


u16 objects;


};


};


union {



struct {


unsigned long private;
/* Mapping-private opaque data:





* usually used for buffer_heads



* if PagePrivate set; used for



* swp_entry_t if PageSwapCache;



* indicates order in the buddy



* system if PG_buddy is set.



*/


struct address_space *mapping;
/* If low bit clear, points to



* inode address_space, or NULL.



* If page mapped as anonymous



* memory, low bit is set, and



* it points to anon_vma object:



* see PAGE_MAPPING_ANON below.



*/



};

#if USE_SPLIT_PTLOCKS


spinlock_t ptl;

#endif


struct kmem_cache *slab;
/* SLUB: Pointer to slab */




struct page *first_page;
/* Compound tail pages */


};


union {


pgoff_t index;
/* Our offset within mapping. */


void *freelist;
/* SLUB: freelist req. slab lock */


};


struct list_head lru;
/* Pageout list, eg. active_list



* protected by zone->lru_lock !



*/


/*



* On machines where all RAM is mapped into kernel address space,



* we can simply calculate the virtual address. On machines with



* highmem some memory is mapped into kernel virtual memory



* dynamically, so we need a place to store that address.



* Note that this field could be 16 bits on x86 ... ;)



*



* Architectures with slow multiplication can define



* WANT_PAGE_VIRTUAL in asm/page.h



*/

#if defined(WANT_PAGE_VIRTUAL)

void *virtual;
/* Kernel virtual address (NULL if



not kmapped, ie. highmem) */

#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS

unsigned long debug_flags;
/* Use atomic bitops on this */

#endif

#ifdef CONFIG_KMEMCHECK

/*



* kmemcheck wants to track the status of each byte in a page; this



* is a pointer to such a status block. NULL if not tracked.



*/


void *shadow;

#endif
};


1.3.
物理内存布局


arch/arm/kernel/setup.c中有函数setup_arch,它调用:


init_mm.start_code = (unsigned long) _text;


init_mm.end_code
= (unsigned long) _etext;


init_mm.end_data
= (unsigned long) _edata;


init_mm.brk

= (unsigned long) _end;



arm_memblock_init(&meminfo, mdesc);


void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
{

int i;



memblock_init();


for (i = 0; i < mi->nr_banks; i++)


memblock_add(mi->bank[i].start, mi->bank[i].size);



/* Register the kernel text, kernel data and initrd with memblock. */

#ifdef CONFIG_XIP_KERNEL

memblock_reserve(__pa(_data), _end - _data);

注释:如果是XIP内核的话,只保留数据空间,代码空间可以在存储介质上直接运行,所以没有必要拷贝到内存中。
#else

memblock_reserve(__pa(_stext), _end - _stext);

注释:预留出kernel的内存空间
#endif
#ifdef CONFIG_BLK_DEV_INITRD

if (phys_initrd_size) {


memblock_reserve(phys_initrd_start, phys_initrd_size);

注释:预留出文件系统的空间

/* Now convert initrd to virtual addresses */


initrd_start = __phys_to_virt(phys_initrd_start);


initrd_end = initrd_start + phys_initrd_size;


}

#endif


arm_mm_memblock_reserve();



/* reserve any platform specific memblock areas */


if (mdesc->reserve)


mdesc->reserve();



memblock_analyze();


memblock_dump_all();

}

上面这个图是根据内核的链接脚本文件来定义的:

OUTPUT_ARCH(arm)
ENTRY(stext)
jiffies = jiffies_64;
SECTIONS
{

. = 0xC0000000 + 0x00008000;


.init : { /* Init code and data
*/



_stext = .;//0xc0008000


_sinittext = .;


*(.head.text)


*(.init.text) *(.cpuinit.text) *(.meminit.text)


_einittext = .;


__proc_info_begin = .;


*(.proc.info.init)


__proc_info_end = .;


__arch_info_begin = .;


*(.arch.info.init)


__arch_info_end = .;


__tagtable_begin = .;


*(.taglist.init)


__tagtable_end = .;


. = ALIGN(16); __setup_start = .; *(.init.setup) __setup_end = .;


__initcall_start = .; *(.initcallearly.init) __early_initcall_end = .; *(.initcall0.init) *(.initcall0s.init) *(.initcall1.init) *(.initcall1s.init) *(.initcall2.init) *(.initcall2s.init) *(.initcall3.init) *(.initcall3s.init) *(.initcall4.init) *(.initcall4s.init) *(.initcall5.init) *(.initcall5s.init) *(.initcallrootfs.init) *(.initcall6.init) *(.initcall6s.init) *(.initcall7.init) *(.initcall7s.init) __initcall_end = .;


__con_initcall_start = .; *(.con_initcall.init) __con_initcall_end = .;


__security_initcall_start = .; *(.security_initcall.init) __security_initcall_end = .;


. = ALIGN((1 << 12)); __initramfs_start = .; *(.init.ramfs) __initramfs_end = .;


__init_begin = _stext;


*(.init.data) *(.cpuinit.data) *(.meminit.data) . = ALIGN(8); __ctors_start = .; *(.ctors) __ctors_end = .; *(.init.rodata) *(.cpuinit.rodata) *(.meminit.rodata)


}


. = ALIGN((1 << 12)); .data..percpu : AT(ADDR(.data..percpu) - 0) { __per_cpu_load = .; __per_cpu_start = .; *(.data..percpu..first) *(.data..percpu..page_aligned) *(.data..percpu) *(.data..percpu..shared_aligned) __per_cpu_end = .; }


. = ALIGN((1 << 12));


__init_end = .;//
c0021000


/*



* unwind exit sections must be discarded before the rest of the



* unwind sections get included.



*/


/DISCARD/ : {


*(.ARM.exidx.exit.text)


*(.ARM.extab.exit.text)


*(.ARM.exidx.cpuexit.text)


*(.ARM.extab.cpuexit.text)


}


.text : { /* Real text segment
*/



_text = .;
c0021000 /* Text and read-only data
*/


__exception_text_start = .;


*(.exception.text)


__exception_text_end = .;


. = ALIGN(8); *(.text.hot) *(.text) *(.ref.text) *(.devinit.text) *(.devexit.text) *(.text.unlikely)


. = ALIGN(8); __sched_text_start = .; *(.sched.text) __sched_text_end = .;


. = ALIGN(8); __lock_text_start = .; *(.spinlock.text) __lock_text_end = .;


. = ALIGN(8); __kprobes_text_start = .; *(.kprobes.text) __kprobes_text_end = .;


*(.fixup)


*(.gnu.warning)


*(.rodata)


*(.rodata.*)


*(.glue_7)


*(.glue_7t)


*(.got) /* Global offset table
*/


}


. = ALIGN(((1 << 12))); .rodata : AT(ADDR(.rodata) - 0) { __start_rodata = .; *(.rodata) *(.rodata.*) *(__vermagic) *(__markers_strings) *(__tracepoints_strings) } .rodata1 : AT(ADDR(.rodata1) - 0) { *(.rodata1) } .pci_fixup : AT(ADDR(.pci_fixup) - 0) { __start_pci_fixups_early = .; *(.pci_fixup_early) __end_pci_fixups_early = .; __start_pci_fixups_header = .; *(.pci_fixup_header) __end_pci_fixups_header = .; __start_pci_fixups_final = .; *(.pci_fixup_final) __end_pci_fixups_final = .; __start_pci_fixups_enable = .; *(.pci_fixup_enable) __end_pci_fixups_enable = .; __start_pci_fixups_resume = .; *(.pci_fixup_resume) __end_pci_fixups_resume = .; __start_pci_fixups_resume_early = .; *(.pci_fixup_resume_early) __end_pci_fixups_resume_early = .; __start_pci_fixups_suspend = .; *(.pci_fixup_suspend) __end_pci_fixups_suspend = .; } .builtin_fw : AT(ADDR(.builtin_fw) - 0) { __start_builtin_fw = .; *(.builtin_fw) __end_builtin_fw = .; } .rio_ops : AT(ADDR(.rio_ops) - 0) { __start_rio_switch_ops = .; *(.rio_switch_ops) __end_rio_switch_ops = .; } __ksymtab : AT(ADDR(__ksymtab) - 0) { __start___ksymtab = .; *(__ksymtab) __stop___ksymtab = .; } __ksymtab_gpl : AT(ADDR(__ksymtab_gpl) - 0) { __start___ksymtab_gpl = .; *(__ksymtab_gpl) __stop___ksymtab_gpl = .; } __ksymtab_unused : AT(ADDR(__ksymtab_unused) - 0) { __start___ksymtab_unused = .; *(__ksymtab_unused) __stop___ksymtab_unused = .; } __ksymtab_unused_gpl : AT(ADDR(__ksymtab_unused_gpl) - 0) { __start___ksymtab_unused_gpl = .; *(__ksymtab_unused_gpl) __stop___ksymtab_unused_gpl = .; } __ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - 0) { __start___ksymtab_gpl_future = .; *(__ksymtab_gpl_future) __stop___ksymtab_gpl_future = .; } __kcrctab : AT(ADDR(__kcrctab) - 0) { __start___kcrctab = .; *(__kcrctab) __stop___kcrctab = .; } __kcrctab_gpl : AT(ADDR(__kcrctab_gpl) - 0) { __start___kcrctab_gpl = .; *(__kcrctab_gpl) __stop___kcrctab_gpl = .; } __kcrctab_unused : AT(ADDR(__kcrctab_unused) - 0) { 
__start___kcrctab_unused = .; *(__kcrctab_unused) __stop___kcrctab_unused = .; } __kcrctab_unused_gpl : AT(ADDR(__kcrctab_unused_gpl) - 0) { __start___kcrctab_unused_gpl = .; *(__kcrctab_unused_gpl) __stop___kcrctab_unused_gpl = .; } __kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - 0) { __start___kcrctab_gpl_future = .; *(__kcrctab_gpl_future) __stop___kcrctab_gpl_future = .; } __ksymtab_strings : AT(ADDR(__ksymtab_strings) - 0) { *(__ksymtab_strings) } __init_rodata : AT(ADDR(__init_rodata) - 0) { *(.ref.rodata) *(.devinit.rodata) *(.devexit.rodata) } __param : AT(ADDR(__param) - 0) { __start___param = .; *(__param) __stop___param = .; . = ALIGN(((1 << 12))); __end_rodata = .; } . = ALIGN(((1 << 12)));


_etext = .;//
c01c2000 /* End of text and rodata section */


/*



* Stack unwinding tables



*/


. = ALIGN(8);


.ARM.unwind_idx : {


__start_unwind_idx = .;


*(.ARM.exidx*)


__stop_unwind_idx = .;


}


.ARM.unwind_tab : {


__start_unwind_tab = .;


*(.ARM.extab*)


__stop_unwind_tab = .;


}


. = ALIGN(8192);


__data_loc = .;


.data : AT(__data_loc) {


_data = .; //c01d2000 /* address in memory */


_sdata = .;


/*



* first, the init task union, aligned



* to an 8192 byte boundary.



*/


. = ALIGN(8192); *(.data..init_task)


. = ALIGN((1 << 12)); __nosave_begin = .; *(.data..nosave) . = ALIGN((1 << 12)); __nosave_end = .;


. = ALIGN(32); *(.data..cacheline_aligned)


/*



* The exception fixup table (might need resorting at runtime)



*/


. = ALIGN(32);


__start___ex_table = .;


*(__ex_table)


__stop___ex_table = .;


/*



* and the usual data section



*/


*(.data) *(.ref.data) *(.devinit.data) *(.devexit.data) . = ALIGN(8); __start___markers = .; *(__markers) __stop___markers = .; . = ALIGN(32); __start___tracepoints = .; *(__tracepoints) __stop___tracepoints = .; . = ALIGN(8); __start___verbose = .; *(__verbose) __stop___verbose = .; . = ALIGN(32); . = ALIGN(32);


CONSTRUCTORS


_edata = .;//
c01e4660


}


_edata_loc = __data_loc + SIZEOF(.data);


. = ALIGN(0); __bss_start = .; . = ALIGN(0); .sbss : AT(ADDR(.sbss) - 0) { *(.sbss) *(.scommon) } . = ALIGN(0); .bss : AT(ADDR(.bss) - 0) { *(.bss..page_aligned) *(.dynbss) *(.bss) *(COMMON) } . = ALIGN(0); __bss_stop = .;


_end = .;//
c021236c


.stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } .stab.excl 0 : { *(.stab.excl) } .stab.exclstr 0 : { *(.stab.exclstr) } .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) }


.comment 0 : { *(.comment) }


/* Default discards */


/DISCARD/ : { *(.exit.text) *(.cpuexit.text) *(.memexit.text) *(.exit.data) *(.cpuexit.data) *(.cpuexit.rodata) *(.memexit.data) *(.memexit.rodata) *(.exitcall.exit) *(.discard) }

}
/*

* These must never be empty


* If you have to comment these two assert statements out, your


* binutils is too old (for other reasons as well)


*/

ASSERT((__proc_info_end - __proc_info_begin), "missing CPU support")
ASSERT((__arch_info_end - __arch_info_begin), "no machine record defined")


ADR,LDR基础知识:

ARM
汇编有ldr指令以及ldr、adr伪指令,它们都可以将标号表达式作为操作数,下面通过分析一段代码以及对应的反汇编结果来说明它们的区别。
     ldr     r0, _start
      adr     r0, _start
       ldr     r0, =_start
_start:
        b  _start
        
编译的时候设置 RO 0x30000000,下面是反汇编的结果:
   0x00000000: e59f0004  ldr r0, [pc, #4] ; 0xc
   0x00000004: e28f0000  add r0, pc, #0 ; 0x0
   0x00000008: e59f0000  ldr r0, [pc, #0] ; 0x10
   0x0000000c: eafffffe  b 0xc
   0x00000010: 3000000c  andcc r0, r0, ip

1、ldr     r0, _start
    这是一条指令,从内存地址 _start 的位置把值读入。
在这里_start是一个标号(是一个相对程序的表达式),汇编程序计算相对于 PC 的偏移量,并生成相对于 PC的前索引的指令:ldr r0, [pc, #4]。执行指令后,r0 = 0xeafffffe
    ldr r0, _start是根据_start对当前PC的相对位置读取其所在地址的值,因此可以在和_start标号的相对位置不变的情况下移动。
2、adr     r0, _start
    这是一条伪指令,总是会被汇编程序汇编为一个指令。汇编程序尝试产生单个 ADD 或 SUB 指令来装载该地址。如果不能在一个指令中构造该地址,则生成一个错误,并且汇编失败。
    在这里是取得标号_start 的地址到 r0,因为地址是相对程序的,因此ADR产生依赖于位置的代码,在此例中被汇编成:add r0, pc, #0。因此该代码可以在和标号相对位置不变的情况下移动;
    假如这段代码在 0x30000000 运行,那么 adr r0, _start 得到 r0 = 0x3000000c;如果在地址 0 运行,就是 0x0000000c 了。
    通过这一点可以判断程序在什么地方运行。U-boot中那段relocate代码就是通过adr判断当前程序是在RAM中还是flash中运行的,下面进行简要分析。

relocate: /* U-Boot重新定位到RAM */
    adr r0, _start /* r0
是代码的当前位置 */
/* adr
伪指令,汇编器自动通过当前PC的值算出如果执行到_start时PC的值,放到r0中:
当此段在flash中执行时r0 = _start = 0;当此段在RAM中执行时_start = _TEXT_BASE(board/smdk2410/config.mk中指定的值为0x30000000,即u-boot在把代码拷贝到RAM中去执行的代码段的开始) */
    ldr r1, _TEXT_BASE /*
测试判断是从Flash启动,还是RAM */
/*
此句执行的结果r1始终是0x30000000,因为此值是又编译器指定的(ads中设置,或-D设置编译器参数) */
    cmp r0, r1 /*
比较r0和r1,调试的时候不要执行重定位 */
3、ldr     r0, =_start
    这是一条伪指令,是一个相对程序的或外部的表达式。汇编程序将相对程序的标号表达式 label-expr 的值放在一个文字池中,并生成一个相对程序的 LDR 指令来从文字池中装载该值,在此例中生成的指令为:ldr r0, [pc, #0],对应文字池中的地址以及值为:0x00000010: 3000000c。如果 label-expr 是一个外部表达式,或者未包含于当前段内,则汇编程序在目标文件中放置一个链接程序重定位命令。链接程序在链接时生成地址。
    因此取得的是标号 _start 的绝对地址,这个绝对地址(运行地址)是在连接的时候确定的。它要占用 2 个 32bit 的空间,一条是指令,另一条是文字池中存放_start 的绝对地址。因此可以看出,不管这段代码将来在什么地方运行,它的结果都是 r0 = 0x3000000c。由于ldr r0, =_start取得的是_start的绝对地址,这句代码可以在_start标号的绝对位置不变的情况下移动;如果使用寄存器pc在程序中可以实现绝对转移。

Linux内核的临时页表建立过程:

__create_page_tables:

pgtbl
r4
@ page table address


注释:
这里r4=0x4000

.macro
pgtbl, rd


ldr
\rd, =(KERNEL_RAM_PADDR - 0x4000)


.endm


#define KERNEL_RAM_PADDR
(PHYS_OFFSET + TEXT_OFFSET)


arch/arm/mach-xxxx/include/mach/memory.h中有:

#define PHYS_OFFSET
UL(0x00000000)




/*



* Clear the 16K level 1 swapper page table



*/


mov
r0, r4 //r0 = r4 = 0x4000


mov
r3, #0 //r3 = 0x0


add
r6, r0, #0x4000 // r6 = r0 + 0x4000 = 0x8000

1:
str
r3, [r0], #4


str
r3, [r0], #4


str
r3, [r0], #4


str
r3, [r0], #4


teq
r0, r6


bne
1b

注释:清除0x4000-0x8000的空间为0x0


ldr
r7, [r10, #PROCINFO_MM_MMUFLAGS] @ mm_mmuflags



/*



* Create identity mapping for first MB of kernel to



* cater for the MMU enable.
This identity mapping



* will be removed by paging_init().
We use our current program



* counter to determine corresponding section base address.



*/


mov
r6, pc //
把当前的PC值给r6寄存器,假设当前PC值为0x800c

mov
r6, r6, lsr #20
@ start of kernel section //r6 = 0x0
,右移20

orr
r3, r7, r6, lsl #20
@ flags + kernel base //
再把r6左移20位,加上 mmu mmflags r7

str
r3, [r4, r6, lsl #2]
@ identity mapping //
r3的值存放到以r4为基地址,r6为索引的页表里面。


/*



* Now setup the pagetables for our kernel direct



* mapped region.



*/


add
r0, r4,
#(KERNEL_START & 0xff000000) >> 18


str
r3, [r0, #(KERNEL_START & 0x00f00000) >> 18]!


ldr
r6, =(KERNEL_END - 1)


add
r0, r0, #4


add
r6, r4, r6, lsr #18

1:
cmp
r0, r6


add
r3, r3, #1 << 20


strls
r3, [r0], #4


bls
1b

注释:
map整个内核的数据段、代码段空间到0xc0000000开始的虚拟地址空间

#define KERNEL_START
KERNEL_RAM_VADDR

#define KERNEL_END
_end


#define KERNEL_RAM_VADDR
(PAGE_OFFSET + TEXT_OFFSET) = 0xc0008000




#ifdef CONFIG_XIP_KERNEL

/*



* Map some ram to cover our .data and .bss areas.



*/


orr
r3, r7, #(KERNEL_RAM_PADDR & 0xff000000)


.if
(KERNEL_RAM_PADDR & 0x00f00000)


orr
r3, r3, #(KERNEL_RAM_PADDR & 0x00f00000)


.endif


add
r0, r4,
#(KERNEL_RAM_VADDR & 0xff000000) >> 18


str
r3, [r0, #(KERNEL_RAM_VADDR & 0x00f00000) >> 18]!


ldr
r6, =(_end - 1)


add
r0, r0, #4


add
r6, r4, r6, lsr #18

1:
cmp
r0, r6


add
r3, r3, #1 << 20


strls
r3, [r0], #4


bls
1b

#endif


/*



* Then map first 1MB of ram in case it contains our boot params.



*/


add
r0, r4, #PAGE_OFFSET >> 18


orr
r6, r7, #(PHYS_OFFSET & 0xff000000)


.if
(PHYS_OFFSET & 0x00f00000)


orr
r6, r6, #(PHYS_OFFSET & 0x00f00000)


.endif


str
r6, [r0]

注释:重新把0~1M处的SDRAM映射(map)到0xc0000000-0xc0100000处。
#ifdef CONFIG_DEBUG_LL

ldr
r7, [r10, #PROCINFO_IO_MMUFLAGS] @ io_mmuflags


/*



* Map in IO space for serial debugging.



* This allows debug messages to be output



* via a serial console before paging_init.



*/


ldr
r3, [r8, #MACHINFO_PGOFFIO]


add
r0, r4, r3


rsb
r3, r3, #0x4000
@ PTRS_PER_PGD*sizeof(long)


cmp
r3, #0x0800
@ limit to 512MB


movhi
r3, #0x0800


add
r6, r0, r3


ldr
r3, [r8, #MACHINFO_PHYSIO]


orr
r3, r3, r7

1:
str
r3, [r0], #4


add
r3, r3, #1 << 20


teq
r0, r6


bne
1b

#if defined(CONFIG_ARCH_NETWINDER) || defined(CONFIG_ARCH_CATS)

/*



* If we're using the NetWinder or CATS, we also need to map



* in the 16550-type serial port for the debug messages



*/


add
r0, r4, #0xff000000 >> 18


orr
r3, r7, #0x7c000000


str
r3, [r0]

#endif
#ifdef CONFIG_ARCH_RPC

/*



* Map in screen at 0x02000000 & SCREEN2_BASE



* Similar reasons here - for debug.
This is



* only for Acorn RiscPC architectures.



*/


add
r0, r4, #0x02000000 >> 18


orr
r3, r7, #0x02000000


str
r3, [r0]


add
r0, r4, #0xd8000000 >> 18


str
r3, [r0]

#endif
#endif

mov
pc, lr

ENDPROC(__create_page_tables)

.ltorg





第一步,只一对一map1M大小的空间到0x0开始的虚拟地址空间。

第二步map整个内核的数据段代码段空间到0xc0000000开始的虚拟地址空间。内核的stext位置为0x8000处。

第三步,再把物理内存起始处1M大小的空间map到0xc0000000开始的虚拟地址空间(以覆盖boot params所在区域)。

1.3.1. 物理内存数据结构初始化
Meminfo结构体描述实际的物理内存情况:起始地址、实际内存大小、是否属于highmem。

Arch/arm/include/asm中有定义:
struct membank {

unsigned long start;


unsigned long size;


unsigned int highmem;

};

struct meminfo {

int nr_banks;


struct membank bank[NR_BANKS];

};

extern struct meminfo meminfo;

linux定义了一些有用的宏:
#define for_each_bank(iter,mi)
\


for (iter = 0; iter < (mi)->nr_banks; iter++)


#define bank_pfn_start(bank)
__phys_to_pfn((bank)->start)

#define bank_pfn_end(bank)
__phys_to_pfn((bank)->start + (bank)->size)

#define bank_pfn_size(bank)
((bank)->size >> PAGE_SHIFT)

#define bank_phys_start(bank)
(bank)->start

#define bank_phys_end(bank)
((bank)->start + (bank)->size)

#define bank_phys_size(bank)
(bank)->size


Arch/arm/mach- RealView为例子加以说明:

MACHINE_START(REALVIEW_EB, "ARM-RealView EB")

/* Maintainer: ARM Ltd/Deep Blue Solutions Ltd */


.phys_io
= REALVIEW_EB_UART0_BASE & SECTION_MASK,


.io_pg_offst
= (IO_ADDRESS(REALVIEW_EB_UART0_BASE) >> 18) & 0xfffc,


.boot_params
= PHYS_OFFSET + 0x00000100,


.fixup
= realview_fixup,


.map_io
= realview_eb_map_io,


.init_irq
= gic_init_irq,


.timer
= &realview_eb_timer,


.init_machine
= realview_eb_init,

MACHINE_END

/*

* Set of macros to define architecture features.
This is built into


* a table by the linker.


*/

#define MACHINE_START(_type,_name)
\

static const struct machine_desc __mach_desc_##_type
\


__used
\


__attribute__((__section__(".arch.info.init"))) = {
\


.nr
= MACH_TYPE_##_type,
\


.name
= _name,


#define MACHINE_END
\

};

struct machine_desc {

/*



* Note! The first four elements are used



* by assembler code in head.S, head-common.S



*/


unsigned int
nr;
/* architecture number
*/


unsigned int
nr_irqs;
/* number of IRQs */


unsigned int
phys_io;
/* start of physical io
*/


unsigned int
io_pg_offst;
/* byte offset for io



* page tabe entry
*/



const char
*name;
/* architecture name
*/


unsigned long
boot_params;
/* tagged list
*/



unsigned int
video_start;
/* start of video RAM
*/


unsigned int
video_end;
/* end of video RAM
*/



unsigned int
reserve_lp0 :1;
/* never has lp0
*/


unsigned int
reserve_lp1 :1;
/* never has lp1
*/


unsigned int
reserve_lp2 :1;
/* never has lp2
*/


unsigned int
soft_reboot :1;
/* soft reboot
*/


void
(*fixup)(struct machine_desc *,



struct tag *, char **,



struct meminfo *);


void
(*reserve)(void);/* reserve mem blocks
*/


void
(*map_io)(void);/* IO mapping function
*/


void
(*init_irq)(void);


struct sys_timer
*timer;
/* system tick timer
*/


void
(*init_machine)(void);

};



/*

* Setup the memory banks.


*/

void realview_fixup(struct machine_desc *mdesc, struct tag *tags, char **from,


struct meminfo *meminfo)

{

/*



* Most RealView platforms have 512MB contiguous RAM at 0x70000000.



* Half of this is mirrored at 0.



*/

#ifdef CONFIG_REALVIEW_HIGH_PHYS_OFFSET

meminfo->bank[0].start = 0x70000000;


meminfo->bank[0].size = SZ_512M;


meminfo->nr_banks = 1;

#else

meminfo->bank[0].start = 0;


meminfo->bank[0].size = SZ_256M;


meminfo->nr_banks = 1;

#endif
}

如果不用这个fixup接口的话,内核是如何默认初始化meminfo结构的?
在linux的make menuconfig中,我们曾经设置过boot option中的参数mem = 64M,这个时候会把64M传递给这个函数。
static int __init early_mem(char *p)
{

static int usermem __initdata = 0;


unsigned long size, start;


char *endp;



/*



* If the user specifies memory size, we



* blow away any automatically generated



* size.



*/


if (usermem == 0) {


usermem = 1;


meminfo.nr_banks = 0;


}



start = PHYS_OFFSET;


size
= memparse(p, &endp);


if (*endp == '@')


start = memparse(endp + 1, NULL);



arm_add_memory(start, size);


注释:
static int __init arm_add_memory(unsigned long start, unsigned long size)
{

struct membank *bank = &meminfo.bank[meminfo.nr_banks];



if (meminfo.nr_banks >= NR_BANKS) {


printk(KERN_CRIT "NR_BANKS too low, "


"ignoring memory at %#lx\n", start);



return -EINVAL;


}



/*



* Ensure that start/size are aligned to a page boundary.



* Size is appropriately rounded down, start is rounded up.



*/


size -= start & ~PAGE_MASK;


bank->start = PAGE_ALIGN(start);


bank->size
= size & PAGE_MASK;



/*



* Check whether this memory region has non-zero size or



* invalid node number.



*/


if (bank->size == 0)


return -EINVAL;



meminfo.nr_banks++;


return 0;

}


return 0;

}
early_param("mem", early_mem);
1.3.2. 物理内存块数据结构初始化
Memblock结构体由meminfo初始化,描述哪些内存被保留,哪些可以被动态使用。

#define MAX_MEMBLOCK_REGIONS 128

struct memblock_property {

u64 base;


u64 size;

};

struct memblock_region {

unsigned long cnt;


u64 size;


struct memblock_property region[MAX_MEMBLOCK_REGIONS+1];

};

struct memblock {

unsigned long debug;


u64 rmo_size;


struct memblock_region memory;


struct memblock_region reserved;

};

extern struct memblock memblock;


void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
{

int i;



memblock_init();

注释:该函数初始化了memblock_property
region[0]


void __init memblock_init(void)
{

/* Create a dummy zero size MEMBLOCK which will get coalesced away later.



* This simplifies the memblock_add() code below...



*/


memblock.memory.region[0].base = 0;


memblock.memory.region[0].size = 0;


memblock.memory.cnt = 1;


/* Ditto. */


memblock.reserved.region[0].base = 0;


memblock.reserved.region[0].size = 0;


memblock.reserved.cnt = 1;

}


for (i = 0; i < mi->nr_banks; i++)


memblock_add(mi->bank[i].start, mi->bank[i].size);



/* Register the kernel text, kernel data and initrd with memblock. */

#ifdef CONFIG_XIP_KERNEL

memblock_reserve(__pa(_data), _end - _data);

#else

memblock_reserve(__pa(_stext), _end - _stext);

注释:
kernel的代码段和数据段在内存的保留中设置为保留,这段内存不能被动态分配使用。
#endif
#ifdef CONFIG_BLK_DEV_INITRD

if (phys_initrd_size) {


memblock_reserve(phys_initrd_start, phys_initrd_size);

注释:
保留存储initrd的那段内存不被别的程序使用。

/* Now convert initrd to virtual addresses */


initrd_start = __phys_to_virt(phys_initrd_start);


initrd_end = initrd_start + phys_initrd_size;


}

#endif


arm_mm_memblock_reserve();

注释:保留存储内核页表的那段内存不被分配,起始地址为0x4000,大小共16K字节。
void __init arm_mm_memblock_reserve(void)
{

/*



* Reserve the page tables.
These are already in use,



* and can only be in node 0.



*/


memblock_reserve(__pa(swapper_pg_dir), PTRS_PER_PGD * sizeof(pgd_t));


#ifdef CONFIG_SA1111

/*



* Because of the SA1111 DMA bug, we want to preserve our



* precious DMA-able memory...



*/


memblock_reserve(PHYS_OFFSET, __pa(swapper_pg_dir) - PHYS_OFFSET);

#endif
}

/* reserve any platform specific memblock areas */


if (mdesc->reserve)


mdesc->reserve();

注释:如果在arch/arm/mach-xxx/下面有设置自己的reserve接口,则执行特定的reserve函数。


memblock_analyze();


memblock_dump_all();

}

2. Linux 中的分页
Linux采用了一种同时适用于32bit和64bit系统的普通分页模型。从2.6.11版本开始使用四级分页模型。

2.1.
页全局目录(Page Global Directory, PGD
2.2.
页上级目录 (Page Upper Directory, PUD)
2.3.
页中间目录 (Page Middle Directory, PMD)
2.4.
页表 (Page Table)




#define PTRS_PER_PTE
512

#define PTRS_PER_PMD
1

#define PTRS_PER_PGD
2048


#define PMD_SHIFT
21

#define PGDIR_SHIFT
21


static inline void prepare_page_table(void)
{

unsigned long addr;



/*



* Clear out all the mappings below the kernel image.



*/


for (addr = 0; addr < MODULES_VADDR; addr += PGDIR_SIZE)


pmd_clear(pmd_off_k(addr));

注释:

/* Find an entry in the second-level page table.. */
#define pmd_offset(dir, addr)
((pmd_t *)(dir))

static inline pmd_t *pmd_off(pgd_t *pgd, unsigned long virt)
{

return pmd_offset(pgd, virt);

}

static inline pmd_t *pmd_off_k(unsigned long virt)
{

return pmd_off(
pgd_offset_k(virt), virt);

}

/* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(addr)
pgd_offset(&init_mm, addr)

init_mm中的变量pgd的值,为页目录基地址。

/* to find an entry in a page-table-directory */
#define pgd_index(addr)
((addr) >> PGDIR_SHIFT)


#define pgd_offset(mm, addr)
((mm)->pgd+pgd_index(addr))

取虚拟地址addr对应的页全局目录项的地址。


#define pmd_clear(pmdp)
\


do {
\


pmdp[0] = __pmd(0);
\


pmdp[1] = __pmd(0);
\

清除两个字单元8字节。由前面的那个图我们知道,事实上linux内核的页目录项存储是8字节单元的,而不是传统我们认为的4字节。

clean_pmd_entry(pmdp);
\


} while (0)


这段code用图表演示:



#ifdef CONFIG_XIP_KERNEL

/* The XIP kernel is mapped in the module area -- skip over it */


addr = ((unsigned long)_etext + PGDIR_SIZE - 1) & PGDIR_MASK;

#endif

for ( ; addr < PAGE_OFFSET; addr += PGDIR_SIZE)


pmd_clear(pmd_off_k(addr));

注释:
清除从MODULES_VADDR开始到0xc0000000的内存地址对应的页目录项。


/*



* Clear out all the kernel space mappings, except for the first



* memory bank, up to the end of the vmalloc region.



*/


for (addr = __phys_to_virt(bank_phys_end(&meminfo.bank[0]));



addr < VMALLOC_END; addr += PGDIR_SIZE)


pmd_clear(pmd_off_k(addr));

}

注释:清除非连续内存区的线性地址的内核映射目录项,内存区间见下图红色部分。


综合上面所描述,这个prepare_page_table函数执行后,将会有下图灰色空间部分的内存在swapper_pg_dir内核页表中对应的目录项被清零。

但是他保留了与实际物理内存一一对应的线性地址对应的页表目录项,也就保留了kernel在初始化的时候的map映射。


static inline void map_memory_bank(struct membank *bank)
{

struct map_desc map;



map.pfn = bank_pfn_start(bank);


map.virtual = __phys_to_virt(bank_phys_start(bank));


map.length = bank_phys_size(bank);


map.type = MT_MEMORY;



create_mapping(&map);

}

static void __init map_lowmem(void)
{

struct meminfo *mi = &meminfo;


int i;



/* Map all the lowmem memory banks. */


for (i = 0; i < mi->nr_banks; i++) {


struct membank *bank = &mi->bank[i];



if (!bank->highmem)


map_memory_bank(bank);


}

}
注释:把物理内存一对一map0xc0000000开始的内核虚拟地址空间。



static void __init devicemaps_init(struct machine_desc *mdesc)
{

struct map_desc map;


unsigned long addr;


void *vectors;



/*



* Allocate the vector page early.



*/


vectors = early_alloc(PAGE_SIZE);

注释:
分配一页物理内存并转化为虚拟地址,给Vectors.

for (addr = VMALLOC_END; addr; addr += PGDIR_SIZE)


pmd_clear(pmd_off_k(addr));

注释:
清除VMALLOC_END开始的虚拟地址空间对应的页目录项,灰色部分对应。



/*



* Map the kernel if it is XIP.



* It is always first in the modulearea.



*/

#ifdef CONFIG_XIP_KERNEL

map.pfn = __phys_to_pfn(CONFIG_XIP_PHYS_ADDR & SECTION_MASK);


map.virtual = MODULES_VADDR;


map.length = ((unsigned long)_etext - map.virtual + ~SECTION_MASK) & SECTION_MASK;


map.type = MT_ROM;


create_mapping(&map);

#endif


/*



* Map the cache flushing regions.



*/

#ifdef FLUSH_BASE

map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS);


map.virtual = FLUSH_BASE;


map.length = SZ_1M;


map.type = MT_CACHECLEAN;


create_mapping(&map);

#endif
#ifdef FLUSH_BASE_MINICACHE

map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS + SZ_1M);


map.virtual = FLUSH_BASE_MINICACHE;


map.length = SZ_1M;


map.type = MT_MINICLEAN;


create_mapping(&map);

#endif


/*



* Create a mapping for the machine vectors at the high-vectors



* location (0xffff0000).
If we aren't using high-vectors, also



* create a mapping at the low-vectors virtual address.



*/


map.pfn = __phys_to_pfn(virt_to_phys(vectors));


map.virtual = 0xffff0000;


map.length = PAGE_SIZE;


map.type = MT_HIGH_VECTORS;


create_mapping(&map);

注释:
把刚刚分配的一页物理内存地址map0xffff0000开始的一页空间处,作为arm的高端中断向量的空间。

if (!vectors_high()) {


map.virtual = 0;


map.type = MT_LOW_VECTORS;


create_mapping(&map);


}



/*



* Ask the machine support to map in the statically mapped devices.



*/


if (mdesc->map_io)



mdesc->map_io();

注释:
Mach-xxx.c文件我们已经设置好了私有的map_io函数,在这里调用。

/*



* Finally flush the caches and tlb to ensure that we're in a



* consistent state wrt the writebuffer.
This also ensures that



* any write-allocated cache lines in the vector page are written



* back.
After this point, we can start to touch devices again.



*/


local_flush_tlb_all();


flush_cache_all();

}



void __init bootmem_init(void)
{

struct meminfo *mi = &meminfo;


unsigned long min, max_low, max_high;



max_low = max_high = 0;



find_limits(mi, &min, &max_low, &max_high);


static void __init find_limits(struct meminfo *mi,

unsigned long *min, unsigned long *max_low, unsigned long *max_high)

{

int i;



*min = -1UL;


*max_low = *max_high = 0;



for_each_bank (i, mi) {


struct membank *bank = &mi->bank[i];


unsigned long start, end;



start = bank_pfn_start(bank);


end = bank_pfn_end(bank);



if (*min > start)


*min = start;


if (*max_high < end)


*max_high = end;


if (bank->highmem)


continue;


if (*max_low < end)



*max_low = end;


}

}


arm_bootmem_init(mi, min, max_low);



/*



* Sparsemem tries to allocate bootmem in memory_present(),



* so must be done after the fixed reservations



*/


arm_memory_present();



/*



* sparse_init() needs the bootmem allocator up and running.



*/


sparse_init();



/*



* Now free the memory - free_area_init_node needs



* the sparse mem_map arrays initialized by sparse_init()



* for memmap_init_zone(), otherwise all PFNs are invalid.



*/


arm_bootmem_free(mi, min, max_low, max_high);



high_memory = __va((max_low << PAGE_SHIFT) - 1) + 1;



/*



* This doesn't seem to be used by the Linux memory manager any



* more, but is used by ll_rw_block.
If we can get rid of it, we



* also get rid of some of the stuff above as well.



*



* Note: max_low_pfn and max_pfn reflect the number of _pages_ in



* the system, not the maximum PFN.



*/


max_low_pfn = max_low - PHYS_PFN_OFFSET;


max_pfn = max_high - PHYS_PFN_OFFSET;

}

3. 非一致内存访问
一致内存概念:
计算机内存是一种均匀、共享的资源。在忽略硬件高速缓存的作用下,我们期望不管内存单元处于何处,也不管CPU处于何处,CPU对内存的访问都需要相同的时间。

不符合以上描述的计算机称为非一致性内存访问(NUMA)计算机。

Linux2.6对这种非一致性内存访问的支持方式为:将系统的物理内存划分为几个不同的节点node,在一个单独的节点内任一给定的CPU对其的访问时间都是相同的。

节点node的描述符:pg_data_t

typedef struct pglist_data {

struct zone node_zones[MAX_NR_ZONES];


struct zonelist node_zonelists[MAX_ZONELISTS];


int nr_zones;

#ifdef CONFIG_FLAT_NODE_MEM_MAP
/* means !SPARSEMEM */


struct page *node_mem_map;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR

struct page_cgroup *node_page_cgroup;

#endif
#endif
#ifndef CONFIG_NO_BOOTMEM

struct bootmem_data *bdata;

#endif
#ifdef CONFIG_MEMORY_HOTPLUG

/*



* Must be held any time you expect node_start_pfn, node_present_pages



* or node_spanned_pages stay constant.
Holding this will also



* guarantee that any pfn_valid() stays that way.



*



* Nests above zone->lock and zone->size_seqlock.



*/


spinlock_t node_size_lock;

#endif

unsigned long node_start_pfn;


unsigned long node_present_pages; /* total number of physical pages */


unsigned long node_spanned_pages; /* total size of physical page



range, including holes */


int node_id;


wait_queue_head_t kswapd_wait;


struct task_struct *kswapd;


int kswapd_max_order;

} pg_data_t;

即使我们的CPU使用一致内存访问,NUMA没有被编译进内核,Linux还是使用节点,不过这是一个单独的节点,它包含了系统的所有物理内存,这个节点被linux2.6定义为contig_page_data

#ifndef CONFIG_NEED_MULTIPLE_NODES
struct pglist_data __refdata contig_page_data = {
#ifndef CONFIG_NO_BOOTMEM

.bdata = &bootmem_node_data[0]

#endif

typedef struct bootmem_data {

unsigned long node_min_pfn;


unsigned long node_low_pfn;


void *node_bootmem_map;


unsigned long last_end_off;


unsigned long hint_idx;


struct list_head list;

} bootmem_data_t;
extern bootmem_data_t bootmem_node_data[];

mmu.c中有paging_init函数调用bootmem_init函数,bootmem_init这个函数又调用arm_bootmem_init函数,在此函数中有code


pgdat = NODE_DATA(0);

注释:
#define NODE_DATA(nid)
(&contig_page_data)
//取内存节点的地址,赋值给指针pgdat


init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn);

4.
内存管理区

enum zone_type {
#ifdef CONFIG_ZONE_DMA

/*



* ZONE_DMA is used when there are devices that are not able



* to do DMA to all of addressable memory (ZONE_NORMAL). Then we



* carve out the portion of memory that is needed for these devices.



* The range is arch specific.



*



* Some examples



*



* Architecture
Limit



* ---------------------------



* parisc, ia64, sparc
<4G



* s390
<2G



* arm
Various



* alpha
Unlimited or 0-16MB.



*



* i386, x86_64 and multiple other arches



*
<16M.



*/


ZONE_DMA,

#endif
#ifdef CONFIG_ZONE_DMA32

/*



* x86_64 needs two ZONE_DMAs because it supports devices that are



* only able to do DMA to the lower 16M but also 32 bit devices that



* can only do DMA areas below 4G.



*/


ZONE_DMA32,

#endif

/*



* Normal addressable memory is in ZONE_NORMAL. DMA operations can be



* performed on pages in ZONE_NORMAL if the DMA devices support



* transfers to all addressable memory.



*/


ZONE_NORMAL,

#ifdef CONFIG_HIGHMEM

/*



* A memory area that is only addressable by the kernel through



* mapping portions into its own address space. This is for example



* used by i386 to allow the kernel to address the memory beyond



* 900MB. The kernel will set up special mappings (page



* table entries on i386) for each page that the kernel needs to



* access.



*/


ZONE_HIGHMEM,

#endif

ZONE_MOVABLE,


__MAX_NR_ZONES

};


有的CPU并不是整个物理内存空间都可以作为DMA的空间,所以ZONE_DMA
的地址空间取决于特定的架构,如下表:



* Architecture

Limit



* ----------------------------------------------------------------------



* parisc, ia64, sparc

<4G



* s390


<2G



* arm


Various



* alpha

Unlimited or 0-16MB.



* i386, x86_64 and multiple other arches
<16M.



ARM任何可寻址的内存都可以做DMA用途,所以我们在内核里没有配置CONFIG_ZONE_DMA;此外如果你的内存大于1GB,则必须配置CONFIG_HIGHMEM。

struct zone {

/* Fields commonly accessed by the page allocator */



/* zone watermarks, access with *_wmark_pages(zone) macros */


unsigned long watermark[NR_WMARK];



/*



* We don't know if the memory that we're going to allocate will be freeable



* or/and it will be released eventually, so to avoid totally wasting several



* GB of ram we must reserve some of the lower zone memory (otherwise we risk



* to run OOM on the lower zones despite there's tons of freeable ram



* on the higher zones). This array is recalculated at runtime if the



* sysctl_lowmem_reserve_ratio sysctl changes.



*/


unsigned long
lowmem_reserve[MAX_NR_ZONES];


#ifdef CONFIG_NUMA

int node;


/*



* zone reclaim becomes active if more unmapped pages exist.



*/


unsigned long
min_unmapped_pages;


unsigned long
min_slab_pages;

#endif

struct per_cpu_pageset __percpu *pageset;


/*



* free areas of different sizes



*/


spinlock_t
lock;


int
all_unreclaimable; /* All pages pinned */

#ifdef CONFIG_MEMORY_HOTPLUG

/* see spanned/present_pages for more description */


seqlock_t
span_seqlock;

#endif

struct free_area
free_area[MAX_ORDER];


#ifndef CONFIG_SPARSEMEM

/*



* Flags for a pageblock_nr_pages block. See pageblock-flags.h.



* In SPARSEMEM, this map is stored in struct mem_section



*/


unsigned long
*pageblock_flags;

#endif /* CONFIG_SPARSEMEM */

#ifdef CONFIG_COMPACTION

/*



* On compaction failure, 1<<compact_defer_shift compactions



* are skipped before trying again. The number attempted since



* last failure is tracked with compact_considered.



*/


unsigned int
compact_considered;


unsigned int
compact_defer_shift;

#endif


ZONE_PADDING(_pad1_)



/* Fields commonly accessed by the page reclaim scanner */


spinlock_t
lru_lock;


struct zone_lru {



struct list_head list;


} lru[NR_LRU_LISTS];



struct zone_reclaim_stat reclaim_stat;



unsigned long
pages_scanned;

/* since last reclaim */


unsigned long
flags;

/* zone flags, see below */



/* Zone statistics */


atomic_long_t
vm_stat[NR_VM_ZONE_STAT_ITEMS];



/*



* prev_priority holds the scanning priority for this zone.
It is



* defined as the scanning priority at which we achieved our reclaim



* target at the previous try_to_free_pages() or balance_pgdat()



* invocation.



*



* We use prev_priority as a measure of how much stress page reclaim is



* under - it drives the swappiness decision: whether to unmap mapped



* pages.



*



* Access to both this field is quite racy even on uniprocessor.
But



* it is expected to average out OK.



*/


int prev_priority;



/*



* The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on



* this zone's LRU.
Maintained by the pageout code.



*/


unsigned int inactive_ratio;




ZONE_PADDING(_pad2_)


/* Rarely used or read-mostly fields */



/*



* wait_table
-- the array holding the hash table



* wait_table_hash_nr_entries
-- the size of the hash table array



* wait_table_bits
-- wait_table_size == (1 << wait_table_bits)



*



* The purpose of all these is to keep track of the people



* waiting for a page to become available and make them



* runnable again when possible. The trouble is that this



* consumes a lot of space, especially when so few things



* wait on pages at a given time. So instead of using



* per-page waitqueues, we use a waitqueue hash table.



*



* The bucket discipline is to sleep on the same queue when



* colliding and wake all in that wait queue when removing.



* When something wakes, it must check to be sure its page is



* truly available, a la thundering herd. The cost of a



* collision is great, but given the expected load of the



* table, they should be so rare as to be outweighed by the



* benefits from the saved space.



*



* __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the



* primary users of these fields, and in mm/page_alloc.c



* free_area_init_core() performs the initialization of them.



*/


wait_queue_head_t
* wait_table;


unsigned long
wait_table_hash_nr_entries;


unsigned long
wait_table_bits;



/*



* Discontig memory support fields.



*/


struct pglist_data
*zone_pgdat;


/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */


unsigned long
zone_start_pfn;



/*



* zone_start_pfn, spanned_pages and present_pages are all



* protected by span_seqlock.
It is a seqlock because it has



* to be read outside of zone->lock, and it is done in the main



* allocator path.
But, it is written quite infrequently.



*



* The lock is declared along with zone->lock because it is



* frequently read in proximity to zone->lock.
It's good to



* give them a chance of being in the same cacheline.



*/


unsigned long
spanned_pages;
/* total size, including holes */


unsigned long
present_pages;
/* amount of memory (excluding holes) */



/*



* rarely used fields:



*/


const char
*name;

} ____cacheline_internodealigned_in_smp;


5.
高端内存页框的内核映射Arch/arm/init.c中有void __init bootmem_init(void)函数
初始化了high_memory 变量
high_memory = __va((max_low << PAGE_SHIFT) - 1) + 1;


高端内存的页框分配只能通过alloc_pages()函数和他的快捷函数alloc_page()因为它不返回线性地址。
内核采用三种不同的机制映射高端内存,分别为:永久内存映射,临时内存映射和非连续内存映射。

伙伴系统算法:



(1)
请求一个2的3次方个连续页的框(8×4Kbyte=32Kbyte),也就是说请求一个32K字节大小的框。如果在第三行的链表中有找到,那么就分配一个框给用户。
(2)
如果在第3行中无法找到这么大小的框,那么就查找第4行,如果找不到就继续查找第5行,以此类推,直到到达管理1024个连续页框的最后一行为止。
(3)
如果在第4行中可以找到64kbyte大小的框,则内核把64k分成两等分,一半用作满足请求另一半插入到第3行的链表当中。
(4)
如果第4行中未找到,则继续找第5行,如果第5行中有找到128k字节的框,那么分出32k给用户,剩余的96k中拿出64k插入到第4行,剩余的32k插入到第3行。

内核试图把一个大小为32k字节的框合并为64k字节的框,如果:
(1)
两个框具有相同的大小(32k);
(2)
他们的物理地址是连续的
(3)
第一个框的第一个页框的物理地址是64k的倍数。
我们称这两个框为伙伴。

请下载完整版:

[hide][/hide]
附件: 您需要登录才可以下载或查看附件。没有帐号?本站只开放邀请码注册,QQ:82475491,索要邀请码
分享到: QQ空间QQ空间 腾讯微博腾讯微博 腾讯朋友腾讯朋友

ding

ding yi xia

TOP

不错的资料呀!!!

TOP

佩服.....佩服.....这种东西很少有人愿意拿出来分的~~

TOP

学习学习

TOP

顶一下..啊 ..顶一下

TOP

学习,学习!!!!!!!!!!!!!!!!

TOP

回复支持,

TOP

内存机制,不错啊

TOP

回复 1# osboy


    顶一下看看

TOP

好东西,我找了很久了,找了几个论坛了

TOP

学习了。

TOP

谢谢!谢谢!谢谢!谢谢!谢谢!

TOP

回复 1# osboy


    TKS

TOP

看看,THX!   !

TOP

活雷锋啊

TOP

回复 1# osboy


    pdf排版的比较好

TOP

非常感谢.....

TOP

感谢lz................

TOP

看看是什么好东东

TOP

学习啦,看看

TOP

回复 1# osboy


    支持一下

TOP

学习,学习,学习……

TOP

楼主有心了,帮顶一下

TOP

顶~~~~~~~~~支持lz!!

TOP

不错的资料啊,赞一个

TOP

这些还是不了解,只能先看看。

TOP

下载来看看

TOP

楼主辛苦了, 感谢分享.

TOP

新教程出炉了吗?

TOP

返回列表
网页右侧QQ悬浮滚动在线客服
网页右侧QQ悬浮滚动在线客服