summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorCliff Wickman <cpw@sgi.com>2015-10-26 23:20:18 +0900
committerAtsushi Kumagai <ats-kumagai@wm.jp.nec.com>2015-10-28 17:52:02 +0900
commit46176a97f329904c10d4efe5582594d55a04339c (patch)
treec6dca5885e7c6220e8f05aa7bc2e14a574482f62 /arch
parent5f70ac4151d7f2a2f04de596c438174316886aca (diff)
[PATCH V7] Exclude page structures of non-dumped pages.
This patch adds a -e option to makedumpfile. The -e option excludes kernel pages that contain nothing but kernel page structures for pages that are not being included in the dump.

The -e option only works in non-cyclic mode, which its use implies, and it only applies to the x86_64 architecture. The -e option requires the use of --work-dir, as it will create a pfn file in that work directory. The --work-dir should probably be set up by the distro procedures which determine the mount point of the root device.

This patch formerly applied after the patch "[PATCH V2] makedumpfile: make --work-dir easier to use", but now it stands alone.

I have tested on large memory systems to demonstrate the importance of this feature to such systems. See some numbers below. The most dramatic demonstration was on a 32TB system where the patch reduced the process from 2 hours to 26 minutes. The size of the dump would probably have been over 30GB (but I ran out of disk space). It was reduced to 5.4GB.

A page structure (56 bytes) exists for every 4096-byte page. This amounts to 3.67 million pages, or about 14GB, per terabyte of system memory! Without -e an idle 2-terabyte system can be dumped (compressed) to a file of about 3.6G. With -e that is reduced to about 456M. And the time and space savings multiply for each additional terabyte of memory in the system.

Experimental time/size results (basically idle systems):

  Memory Size    With -e          Without -e
                 (sec.)           (sec.)

  (using a sles11sp3 kernel that does not provide mmap of /proc/vmcore:)
  1TB              52    244M      257    1.7G
  2TB             128    456M      526    3.6G
  8TB             780    1.6G     3400   13.8G
  16TB           2600    3.1G     9800   (extrapolated, 2:40 is too long to wait)

  (using a sles11sp3 kernel that provides mmap of /proc/vmcore:)
  16TB            900    3.8G     not done
  32TB           6000    5.4G     not done

  (using a sles11sp3 kernel that provides mmap of /proc/vmcore:)
  32TB           1600    5.4G     7300   (extrapolated) (ran out of 19G space before 1/2 done)

The only disadvantage is that various options of the crash 'kmem' command (those that walk lists of page structures) will not work. Version 7.0.9 of crash is already patched to issue a warning about such commands when the dump is flagged DUMP_DH_EXCLUDED_VMEMMAP.

Sorry that this patch is large. The vmemmap page scan is done by some very large functions, and they are all interrelated. I didn't see any point in breaking them into several inter-dependent patches.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Diffstat (limited to 'arch')
-rw-r--r--arch/x86_64.c307
1 files changed, 307 insertions, 0 deletions
diff --git a/arch/x86_64.c b/arch/x86_64.c
index 4788f55..d9765a0 100644
--- a/arch/x86_64.c
+++ b/arch/x86_64.c
@@ -18,6 +18,8 @@
#include "../print_info.h"
#include "../elf_info.h"
#include "../makedumpfile.h"
+extern struct vmap_pfns *gvmem_pfns;
+extern int nr_gvmem_pfns;
int
is_vmalloc_addr_x86_64(ulong vaddr)
@@ -460,5 +462,310 @@ int get_xen_info_x86_64(void)
return TRUE;
}
+/*
+ * Append one area descriptor to the circular, doubly linked list whose
+ * head is *headp (*headp is NULL while the list is empty).
+ *
+ * vmap_pfn_start/vmap_pfn_end and rep_pfn_start/rep_pfn_end are copied
+ * into the new descriptor unchanged.
+ *
+ * Returns the new descriptor, or NULL if it could not be allocated (the
+ * list is left untouched in that case).
+ */
+static struct vmap_pfns *
+vmap_list_append(struct vmap_pfns **headp,
+		 unsigned long vmap_pfn_start, unsigned long vmap_pfn_end,
+		 long rep_pfn_start, long rep_pfn_end)
+{
+	struct vmap_pfns *node, *tail;
+
+	node = malloc(sizeof(*node));
+	if (!node)
+		return NULL;
+
+	node->vmap_pfn_start = vmap_pfn_start;
+	node->vmap_pfn_end = vmap_pfn_end;
+	/* these (start/end) are literal pfns, not start and end+1 */
+	node->rep_pfn_start = rep_pfn_start;
+	node->rep_pfn_end = rep_pfn_end;
+
+	if (!*headp) {
+		*headp = node;
+		node->next = node;
+		node->prev = node;
+	} else {
+		tail = (*headp)->prev;
+		(*headp)->prev = node;
+		tail->next = node;
+		node->next = *headp;
+		node->prev = tail;
+	}
+	return node;
+}
+
+/*
+ * Free every descriptor of the circular list headed by head.
+ * A NULL head (empty list) is accepted.
+ */
+static void
+vmap_list_free(struct vmap_pfns *head)
+{
+	struct vmap_pfns *next;
+
+	if (!head)
+		return;
+	/* open the ring so the walk ends without looking at freed nodes */
+	head->prev->next = NULL;
+	while (head) {
+		next = head->next;
+		free(head);
+		head = next;
+	}
+}
+
+/*
+ * Scan the kernel page table for the pfn's of the page structs.
+ * Place them in array gvmem_pfns[nr_gvmem_pfns].
+ *
+ * The walk starts at the pgd slot of info->vmemmap_start and visits
+ * every pmd entry until the page structs for all pfns below
+ * get_max_paddr() >> 12 have been covered.  Runs of physically
+ * contiguous pmd pages are collected as area descriptors, first in a
+ * temporary linked list and finally in the malloc'ed array gvmem_pfns,
+ * which this function does not free.
+ *
+ * Returns COMPLETED on success.  Returns FAILED if init_level4_pgt or
+ * the size of struct page is unavailable, if a page table page cannot
+ * be read, if a populated vmemmap pmd is not a huge page, or if memory
+ * cannot be allocated.
+ */
+int
+find_vmemmap_x86_64()
+{
+	int i;
+	int pgd_index;
+	int ret = FAILED;
+	int start_range = 1;
+	int num_pmds=0, num_pmds_valid=0;
+	int break_in_valids, break_after_invalids;
+	int do_break, done = 0;
+	int last_valid=0, last_invalid=0;
+	int pagestructsize, structsperhpage, hugepagesize;
+	long page_structs_per_pud;
+	long num_puds, groups = 0;
+	long pgdindex, pudindex, pmdindex;
+	long vaddr_base;
+	long rep_pfn_start = 0, rep_pfn_end = 0;
+	unsigned long init_level4_pgt;
+	unsigned long max_paddr, high_pfn;
+	unsigned long pgd_addr, pud_addr, pmd_addr;
+	unsigned long *pgdp, *pudp, *pmdp;
+	unsigned long pud_page[PTRS_PER_PUD];
+	unsigned long pmd_page[PTRS_PER_PMD];
+	unsigned long vmap_offset_start = 0, vmap_offset_end = 0;
+	unsigned long pmd, tpfn;
+	unsigned long pvaddr = 0;
+	unsigned long data_addr = 0, last_data_addr = 0, start_data_addr = 0;
+	/*
+	 * data_addr is the paddr of the page holding the page structs.
+	 * We keep lists of contiguous pages and the pfn's that their
+	 * page structs represent.
+	 * start_data_addr and last_data_addr mark start/end of those
+	 * contiguous areas.
+	 * An area descriptor is vmap start/end pfn and rep start/end
+	 * of the pfn's represented by the vmap start/end.
+	 */
+	struct vmap_pfns *vmapp, *vmaphead = NULL, *cur, *next;
+
+	init_level4_pgt = SYMBOL(init_level4_pgt);
+	if (init_level4_pgt == NOT_FOUND_SYMBOL) {
+		ERRMSG("init_level4_pgt not found\n");
+		return FAILED;
+	}
+	pagestructsize = size_table.page;
+	/* pagestructsize is used as a divisor everywhere below */
+	if (pagestructsize <= 0) {
+		ERRMSG("Can't get the size of struct page.\n");
+		return FAILED;
+	}
+	/* number of bytes mapped by one pmd entry */
+	hugepagesize = PTRS_PER_PMD * info->page_size;
+	vaddr_base = info->vmemmap_start;
+	max_paddr = get_max_paddr();
+	/*
+	 * the page structures are mapped at VMEMMAP_START (info->vmemmap_start)
+	 * for max_paddr >> 12 page structures
+	 */
+	high_pfn = max_paddr >> 12;
+	pgd_index = pgd4_index(vaddr_base);
+	pgd_addr = vaddr_to_paddr(init_level4_pgt); /* address of pgd */
+	pgd_addr += pgd_index * sizeof(unsigned long);
+	page_structs_per_pud = (PTRS_PER_PUD * PTRS_PER_PMD * info->page_size) /
+				pagestructsize;
+	num_puds = (high_pfn + page_structs_per_pud - 1) / page_structs_per_pud;
+	pvaddr = VMEMMAP_START;
+	structsperhpage = hugepagesize / pagestructsize;
+
+	/* outer loop is for pud entries in the pgd */
+	for (pgdindex = 0, pgdp = (unsigned long *)pgd_addr; pgdindex < num_puds;
+	     pgdindex++, pgdp++) {
+		/* read the pgd one word at a time, into pud_addr */
+		if (!readmem(PADDR, (unsigned long long)pgdp, (void *)&pud_addr,
+			     sizeof(unsigned long))) {
+			/* report the slot being read, not just the first one */
+			ERRMSG("Can't get pgd entry for slot %ld.\n",
+			       pgd_index + pgdindex);
+			goto out;
+		}
+		/* mask the pgd entry for the address of the pud page */
+		pud_addr &= PMASK;
+		/* read the entire pud page */
+		if (!readmem(PADDR, (unsigned long long)pud_addr, (void *)pud_page,
+			     PTRS_PER_PUD * sizeof(unsigned long))) {
+			ERRMSG("Can't get pud entry for pgd slot %ld.\n", pgdindex);
+			goto out;
+		}
+		/* step thru each pmd address in the pud page */
+		/* pudp points to an entry in the pud page */
+		for (pudp = (unsigned long *)pud_page, pudindex = 0;
+		     pudindex < PTRS_PER_PUD; pudindex++, pudp++) {
+			/*
+			 * NOTE(review): a null pud entry is not treated as a
+			 * hole here; its masked value is read as a pmd page.
+			 */
+			pmd_addr = *pudp & PMASK;
+			/* read the entire pmd page */
+			if (!readmem(PADDR, pmd_addr, (void *)pmd_page,
+				     PTRS_PER_PMD * sizeof(unsigned long))) {
+				ERRMSG("Can't get pud entry for slot %ld.\n", pudindex);
+				goto out;
+			}
+			/* pmdp points to an entry in the pmd */
+			for (pmdp = (unsigned long *)pmd_page, pmdindex = 0;
+			     pmdindex < PTRS_PER_PMD; pmdindex++, pmdp++) {
+				pmd = *pmdp;
+				num_pmds++;
+				/* first pfn whose page struct is at pvaddr */
+				tpfn = (pvaddr - VMEMMAP_START) /
+					pagestructsize;
+				if (tpfn >= high_pfn) {
+					done = 1;
+					break;
+				}
+				/*
+				 * vmap_offset_start:
+				 * Starting logical position in the
+				 * vmemmap array for the group stays
+				 * constant until a hole in the table
+				 * or a break in contiguousness.
+				 */
+
+				/*
+				 * Ending logical position in the
+				 * vmemmap array:
+				 */
+				vmap_offset_end += hugepagesize;
+				do_break = 0;
+				break_in_valids = 0;
+				break_after_invalids = 0;
+				/*
+				 * We want breaks either when:
+				 * - we hit a hole (invalid)
+				 * - we hit a discontiguous page in a string
+				 *   of valids
+				 */
+				if (pmd) {
+					data_addr = (pmd & PMASK);
+					if (start_range) {
+						/* first-time kludge */
+						start_data_addr = data_addr;
+						last_data_addr = start_data_addr
+							- hugepagesize;
+						start_range = 0;
+					}
+					if (last_invalid) {
+						/* end of a hole */
+						start_data_addr = data_addr;
+						last_data_addr = start_data_addr
+							- hugepagesize;
+						/* trigger update of offset */
+						do_break = 1;
+					}
+					last_valid = 1;
+					last_invalid = 0;
+					/*
+					 * we have a gap in physical
+					 * contiguousness in the table.
+					 */
+					/* ?? consecutive holes will have
+					   same data_addr */
+					if (data_addr !=
+					    last_data_addr + hugepagesize) {
+						do_break = 1;
+						break_in_valids = 1;
+					}
+					DEBUG_MSG("valid: pud %ld pmd %ld pfn %#lx"
+						  " pvaddr %#lx pfns %#lx-%lx"
+						  " start %#lx end %#lx\n",
+						  pudindex, pmdindex,
+						  data_addr >> 12,
+						  pvaddr, tpfn,
+						  tpfn + structsperhpage - 1,
+						  vmap_offset_start,
+						  vmap_offset_end);
+					num_pmds_valid++;
+					if (!(pmd & _PAGE_PSE)) {
+						ERRMSG("vmemmap pmd not huge, abort\n");
+						goto out;
+					}
+				} else {
+					if (last_valid) {
+						/* this is a hole after some valids */
+						do_break = 1;
+						break_in_valids = 1;
+						break_after_invalids = 0;
+					}
+					last_valid = 0;
+					last_invalid = 1;
+					/*
+					 * There are holes in this sparsely
+					 * populated table; they are 2MB gaps
+					 * represented by null pmd entries.
+					 */
+					DEBUG_MSG("invalid: pud %ld pmd %ld %#lx"
+						  " pfns %#lx-%lx start %#lx end"
+						  " %#lx\n", pudindex, pmdindex,
+						  pvaddr, tpfn,
+						  tpfn + structsperhpage - 1,
+						  vmap_offset_start,
+						  vmap_offset_end);
+				}
+				if (do_break) {
+					/* The end of a hole is not summarized.
+					 * It must be the start of a hole or
+					 * hitting a discontiguous series.
+					 */
+					if (break_in_valids || break_after_invalids) {
+						/*
+						 * calculate the pfns
+						 * represented by the current
+						 * offset in the vmemmap.
+						 */
+						/* page struct even partly on this page */
+						rep_pfn_start = vmap_offset_start /
+							pagestructsize;
+						/* ending page struct entirely on
+						   this page */
+						rep_pfn_end = ((vmap_offset_end -
+							hugepagesize) / pagestructsize);
+						DEBUG_MSG("vmap pfns %#lx-%lx "
+							  "represent pfns %#lx-%lx\n\n",
+							  start_data_addr >> PAGESHIFT(),
+							  last_data_addr >> PAGESHIFT(),
+							  rep_pfn_start, rep_pfn_end);
+						/* pfns of the 2MB pages of page structs */
+						if (!vmap_list_append(&vmaphead,
+						    start_data_addr >> PTE_SHIFT,
+						    last_data_addr >> PTE_SHIFT,
+						    rep_pfn_start, rep_pfn_end)) {
+							ERRMSG("Can't allocate memory for a vmemmap pfn group.\n");
+							goto out;
+						}
+						groups++;
+					}
+
+					/* update logical position at every break */
+					vmap_offset_start =
+						vmap_offset_end - hugepagesize;
+					start_data_addr = data_addr;
+				}
+
+				last_data_addr = data_addr;
+				pvaddr += hugepagesize;
+				/*
+				 * pvaddr is current virtual address
+				 * eg 0xffffea0004200000 if
+				 * vmap_offset_start is 4200000
+				 */
+			}
+			/* all page structs are covered; skip remaining puds */
+			if (done)
+				break;
+		}
+		tpfn = (pvaddr - VMEMMAP_START) / pagestructsize;
+		if (tpfn >= high_pfn) {
+			done = 1;
+			break;
+		}
+	}
+
+	/* summarize the area that was still open when the scan ended */
+	rep_pfn_start = vmap_offset_start / pagestructsize;
+	rep_pfn_end = (vmap_offset_end - hugepagesize) / pagestructsize;
+	DEBUG_MSG("vmap pfns %#lx-%lx represent pfns %#lx-%lx\n\n",
+		  start_data_addr >> PAGESHIFT(), last_data_addr >> PAGESHIFT(),
+		  rep_pfn_start, rep_pfn_end);
+	if (!vmap_list_append(&vmaphead, start_data_addr >> PTE_SHIFT,
+			      last_data_addr >> PTE_SHIFT,
+			      rep_pfn_start, rep_pfn_end)) {
+		ERRMSG("Can't allocate memory for a vmemmap pfn group.\n");
+		goto out;
+	}
+	groups++;
+	DEBUG_MSG("num_pmds: %d num_pmds_valid %d\n", num_pmds, num_pmds_valid);
+
+	/* transfer the linked list to an array */
+	gvmem_pfns = malloc(sizeof(*gvmem_pfns) * groups);
+	if (!gvmem_pfns) {
+		ERRMSG("Can't allocate memory for the vmemmap pfn array.\n");
+		goto out;
+	}
+	/*
+	 * Open the ring and fetch each successor before freeing a node, so
+	 * no freed descriptor is ever dereferenced.
+	 */
+	vmaphead->prev->next = NULL;
+	i = 0;
+	for (cur = vmaphead; cur; cur = next) {
+		next = cur->next;
+		vmapp = gvmem_pfns + i;
+		vmapp->vmap_pfn_start = cur->vmap_pfn_start;
+		vmapp->vmap_pfn_end = cur->vmap_pfn_end;
+		vmapp->rep_pfn_start = cur->rep_pfn_start;
+		vmapp->rep_pfn_end = cur->rep_pfn_end;
+		free(cur);
+		i++;
+	}
+	vmaphead = NULL;	/* the list has been consumed */
+	nr_gvmem_pfns = i;
+	ret = COMPLETED;
+out:
+	/* on failure, release whatever part of the list was built */
+	vmap_list_free(vmaphead);
+	return ret;
+}
+
#endif /* x86_64 */