diff options
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/dump-guest-memory.py | 762 | ||||
-rwxr-xr-x | scripts/kvm/kvm_stat | 1199 |
2 files changed, 1159 insertions, 802 deletions
diff --git a/scripts/dump-guest-memory.py b/scripts/dump-guest-memory.py index 08796fff8c..f274bf80fa 100644 --- a/scripts/dump-guest-memory.py +++ b/scripts/dump-guest-memory.py @@ -1,39 +1,456 @@ -# This python script adds a new gdb command, "dump-guest-memory". It -# should be loaded with "source dump-guest-memory.py" at the (gdb) -# prompt. -# -# Copyright (C) 2013, Red Hat, Inc. -# -# Authors: -# Laszlo Ersek <lersek@redhat.com> -# -# This work is licensed under the terms of the GNU GPL, version 2 or later. See -# the COPYING file in the top-level directory. -# +""" +This python script adds a new gdb command, "dump-guest-memory". It +should be loaded with "source dump-guest-memory.py" at the (gdb) +prompt. + +Copyright (C) 2013, Red Hat, Inc. + +Authors: + Laszlo Ersek <lersek@redhat.com> + Janosch Frank <frankja@linux.vnet.ibm.com> + +This work is licensed under the terms of the GNU GPL, version 2 or later. See +the COPYING file in the top-level directory. +""" + +import ctypes + +UINTPTR_T = gdb.lookup_type("uintptr_t") + +TARGET_PAGE_SIZE = 0x1000 +TARGET_PAGE_MASK = 0xFFFFFFFFFFFFF000 + +# Special value for e_phnum. This indicates that the real number of +# program headers is too large to fit into e_phnum. Instead the real +# value is in the field sh_info of section 0. +PN_XNUM = 0xFFFF + +EV_CURRENT = 1 + +ELFCLASS32 = 1 +ELFCLASS64 = 2 + +ELFDATA2LSB = 1 +ELFDATA2MSB = 2 + +ET_CORE = 4 + +PT_LOAD = 1 +PT_NOTE = 4 + +EM_386 = 3 +EM_PPC = 20 +EM_PPC64 = 21 +EM_S390 = 22 +EM_AARCH = 183 +EM_X86_64 = 62 + +class ELF(object): + """Representation of a ELF file.""" + + def __init__(self, arch): + self.ehdr = None + self.notes = [] + self.segments = [] + self.notes_size = 0 + self.endianess = None + self.elfclass = ELFCLASS64 + + if arch == 'aarch64-le': + self.endianess = ELFDATA2LSB + self.elfclass = ELFCLASS64 + self.ehdr = get_arch_ehdr(self.endianess, self.elfclass) + self.ehdr.e_machine = EM_AARCH + + elif arch == 'aarch64-be': + self.endianess = ELFDATA2MSB + self.ehdr = get_arch_ehdr(self.endianess, self.elfclass) + self.ehdr.e_machine = EM_AARCH + + elif arch == 'X86_64': + self.endianess = ELFDATA2LSB + self.ehdr = get_arch_ehdr(self.endianess, self.elfclass) + self.ehdr.e_machine = EM_X86_64 + + elif arch == '386': + self.endianess = ELFDATA2LSB + self.elfclass = ELFCLASS32 + self.ehdr = get_arch_ehdr(self.endianess, self.elfclass) + self.ehdr.e_machine = EM_386 + + elif arch == 's390': + self.endianess = ELFDATA2MSB + self.ehdr = get_arch_ehdr(self.endianess, self.elfclass) + self.ehdr.e_machine = EM_S390 + + elif arch == 'ppc64-le': + self.endianess = ELFDATA2LSB + self.ehdr = get_arch_ehdr(self.endianess, self.elfclass) + self.ehdr.e_machine = EM_PPC64 + + elif arch == 'ppc64-be': + self.endianess = ELFDATA2MSB + self.ehdr = get_arch_ehdr(self.endianess, self.elfclass) + self.ehdr.e_machine = EM_PPC64 + + else: + raise gdb.GdbError("No valid arch type specified.\n" + "Currently supported types:\n" + "aarch64-be, aarch64-le, X86_64, 386, s390, " + "ppc64-be, ppc64-le") + + self.add_segment(PT_NOTE, 0, 0) + + def add_note(self, n_name, n_desc, n_type): + """Adds a note to the ELF.""" + + note = get_arch_note(self.endianess, len(n_name), len(n_desc)) + note.n_namesz = len(n_name) + 1 + note.n_descsz = len(n_desc) + note.n_name = n_name.encode() + note.n_type = n_type + + # Desc needs to be 4 byte aligned (although the 64bit spec + # specifies 8 byte). When defining n_desc as uint32 it will be + # automatically aligned but we need the memmove to copy the + # string into it. + ctypes.memmove(note.n_desc, n_desc.encode(), len(n_desc)) + + self.notes.append(note) + self.segments[0].p_filesz += ctypes.sizeof(note) + self.segments[0].p_memsz += ctypes.sizeof(note) + + def add_segment(self, p_type, p_paddr, p_size): + """Adds a segment to the elf.""" + + phdr = get_arch_phdr(self.endianess, self.elfclass) + phdr.p_type = p_type + phdr.p_paddr = p_paddr + phdr.p_filesz = p_size + phdr.p_memsz = p_size + self.segments.append(phdr) + self.ehdr.e_phnum += 1 + + def to_file(self, elf_file): + """Writes all ELF structures to the the passed file. + + Structure: + Ehdr + Segment 0:PT_NOTE + Segment 1:PT_LOAD + Segment N:PT_LOAD + Note 0..N + Dump contents + """ + elf_file.write(self.ehdr) + off = ctypes.sizeof(self.ehdr) + \ + len(self.segments) * ctypes.sizeof(self.segments[0]) + + for phdr in self.segments: + phdr.p_offset = off + elf_file.write(phdr) + off += phdr.p_filesz + + for note in self.notes: + elf_file.write(note) + + +def get_arch_note(endianess, len_name, len_desc): + """Returns a Note class with the specified endianess.""" + + if endianess == ELFDATA2LSB: + superclass = ctypes.LittleEndianStructure + else: + superclass = ctypes.BigEndianStructure + + len_name = len_name + 1 + + class Note(superclass): + """Represents an ELF note, includes the content.""" + + _fields_ = [("n_namesz", ctypes.c_uint32), + ("n_descsz", ctypes.c_uint32), + ("n_type", ctypes.c_uint32), + ("n_name", ctypes.c_char * len_name), + ("n_desc", ctypes.c_uint32 * ((len_desc + 3) // 4))] + return Note() + + +class Ident(ctypes.Structure): + """Represents the ELF ident array in the ehdr structure.""" + + _fields_ = [('ei_mag0', ctypes.c_ubyte), + ('ei_mag1', ctypes.c_ubyte), + ('ei_mag2', ctypes.c_ubyte), + ('ei_mag3', ctypes.c_ubyte), + ('ei_class', ctypes.c_ubyte), + ('ei_data', ctypes.c_ubyte), + ('ei_version', ctypes.c_ubyte), + ('ei_osabi', ctypes.c_ubyte), + ('ei_abiversion', ctypes.c_ubyte), + ('ei_pad', ctypes.c_ubyte * 7)] + + def __init__(self, endianess, elfclass): + self.ei_mag0 = 0x7F + self.ei_mag1 = ord('E') + self.ei_mag2 = ord('L') + self.ei_mag3 = ord('F') + self.ei_class = elfclass + self.ei_data = endianess + self.ei_version = EV_CURRENT + + +def get_arch_ehdr(endianess, elfclass): + """Returns a EHDR64 class with the specified endianess.""" + + if endianess == ELFDATA2LSB: + superclass = ctypes.LittleEndianStructure + else: + superclass = ctypes.BigEndianStructure + + class EHDR64(superclass): + """Represents the 64 bit ELF header struct.""" + + _fields_ = [('e_ident', Ident), + ('e_type', ctypes.c_uint16), + ('e_machine', ctypes.c_uint16), + ('e_version', ctypes.c_uint32), + ('e_entry', ctypes.c_uint64), + ('e_phoff', ctypes.c_uint64), + ('e_shoff', ctypes.c_uint64), + ('e_flags', ctypes.c_uint32), + ('e_ehsize', ctypes.c_uint16), + ('e_phentsize', ctypes.c_uint16), + ('e_phnum', ctypes.c_uint16), + ('e_shentsize', ctypes.c_uint16), + ('e_shnum', ctypes.c_uint16), + ('e_shstrndx', ctypes.c_uint16)] + + def __init__(self): + super(superclass, self).__init__() + self.e_ident = Ident(endianess, elfclass) + self.e_type = ET_CORE + self.e_version = EV_CURRENT + self.e_ehsize = ctypes.sizeof(self) + self.e_phoff = ctypes.sizeof(self) + self.e_phentsize = ctypes.sizeof(get_arch_phdr(endianess, elfclass)) + self.e_phnum = 0 + + + class EHDR32(superclass): + """Represents the 32 bit ELF header struct.""" + + _fields_ = [('e_ident', Ident), + ('e_type', ctypes.c_uint16), + ('e_machine', ctypes.c_uint16), + ('e_version', ctypes.c_uint32), + ('e_entry', ctypes.c_uint32), + ('e_phoff', ctypes.c_uint32), + ('e_shoff', ctypes.c_uint32), + ('e_flags', ctypes.c_uint32), + ('e_ehsize', ctypes.c_uint16), + ('e_phentsize', ctypes.c_uint16), + ('e_phnum', ctypes.c_uint16), + ('e_shentsize', ctypes.c_uint16), + ('e_shnum', ctypes.c_uint16), + ('e_shstrndx', ctypes.c_uint16)] + + def __init__(self): + super(superclass, self).__init__() + self.e_ident = Ident(endianess, elfclass) + self.e_type = ET_CORE + self.e_version = EV_CURRENT + self.e_ehsize = ctypes.sizeof(self) + self.e_phoff = ctypes.sizeof(self) + self.e_phentsize = ctypes.sizeof(get_arch_phdr(endianess, elfclass)) + self.e_phnum = 0 + + # End get_arch_ehdr + if elfclass == ELFCLASS64: + return EHDR64() + else: + return EHDR32() + + +def get_arch_phdr(endianess, elfclass): + """Returns a 32 or 64 bit PHDR class with the specified endianess.""" + + if endianess == ELFDATA2LSB: + superclass = ctypes.LittleEndianStructure + else: + superclass = ctypes.BigEndianStructure + + class PHDR64(superclass): + """Represents the 64 bit ELF program header struct.""" + + _fields_ = [('p_type', ctypes.c_uint32), + ('p_flags', ctypes.c_uint32), + ('p_offset', ctypes.c_uint64), + ('p_vaddr', ctypes.c_uint64), + ('p_paddr', ctypes.c_uint64), + ('p_filesz', ctypes.c_uint64), + ('p_memsz', ctypes.c_uint64), + ('p_align', ctypes.c_uint64)] + + class PHDR32(superclass): + """Represents the 32 bit ELF program header struct.""" + + _fields_ = [('p_type', ctypes.c_uint32), + ('p_offset', ctypes.c_uint32), + ('p_vaddr', ctypes.c_uint32), + ('p_paddr', ctypes.c_uint32), + ('p_filesz', ctypes.c_uint32), + ('p_memsz', ctypes.c_uint32), + ('p_flags', ctypes.c_uint32), + ('p_align', ctypes.c_uint32)] + + # End get_arch_phdr + if elfclass == ELFCLASS64: + return PHDR64() + else: + return PHDR32() + + +def int128_get64(val): + """Returns low 64bit part of Int128 struct.""" + + assert val["hi"] == 0 + return val["lo"] + + +def qlist_foreach(head, field_str): + """Generator for qlists.""" + + var_p = head["lh_first"] + while var_p != 0: + var = var_p.dereference() + var_p = var[field_str]["le_next"] + yield var + + +def qemu_get_ram_block(ram_addr): + """Returns the RAMBlock struct to which the given address belongs.""" + + ram_blocks = gdb.parse_and_eval("ram_list.blocks") + + for block in qlist_foreach(ram_blocks, "next"): + if (ram_addr - block["offset"]) < block["used_length"]: + return block + + raise gdb.GdbError("Bad ram offset %x" % ram_addr) + + +def qemu_get_ram_ptr(ram_addr): + """Returns qemu vaddr for given guest physical address.""" + + block = qemu_get_ram_block(ram_addr) + return block["host"] + (ram_addr - block["offset"]) + + +def memory_region_get_ram_ptr(memory_region): + if memory_region["alias"] != 0: + return (memory_region_get_ram_ptr(memory_region["alias"].dereference()) + + memory_region["alias_offset"]) + + return qemu_get_ram_ptr(memory_region["ram_addr"] & TARGET_PAGE_MASK) + + +def get_guest_phys_blocks(): + """Returns a list of ram blocks. + + Each block entry contains: + 'target_start': guest block phys start address + 'target_end': guest block phys end address + 'host_addr': qemu vaddr of the block's start + """ + + guest_phys_blocks = [] + + print("guest RAM blocks:") + print("target_start target_end host_addr message " + "count") + print("---------------- ---------------- ---------------- ------- " + "-----") + + current_map_p = gdb.parse_and_eval("address_space_memory.current_map") + current_map = current_map_p.dereference() + + # Conversion to int is needed for python 3 + # compatibility. Otherwise range doesn't cast the value itself and + # breaks. + for cur in range(int(current_map["nr"])): + flat_range = (current_map["ranges"] + cur).dereference() + memory_region = flat_range["mr"].dereference() + + # we only care about RAM + if not memory_region["ram"]: + continue + + section_size = int128_get64(flat_range["addr"]["size"]) + target_start = int128_get64(flat_range["addr"]["start"]) + target_end = target_start + section_size + host_addr = (memory_region_get_ram_ptr(memory_region) + + flat_range["offset_in_region"]) + predecessor = None + + # find continuity in guest physical address space + if len(guest_phys_blocks) > 0: + predecessor = guest_phys_blocks[-1] + predecessor_size = (predecessor["target_end"] - + predecessor["target_start"]) + + # the memory API guarantees monotonically increasing + # traversal + assert predecessor["target_end"] <= target_start + + # we want continuity in both guest-physical and + # host-virtual memory + if (predecessor["target_end"] < target_start or + predecessor["host_addr"] + predecessor_size != host_addr): + predecessor = None + + if predecessor is None: + # isolated mapping, add it to the list + guest_phys_blocks.append({"target_start": target_start, + "target_end": target_end, + "host_addr": host_addr}) + message = "added" + else: + # expand predecessor until @target_end; predecessor's + # start doesn't change + predecessor["target_end"] = target_end + message = "joined" + + print("%016x %016x %016x %-7s %5u" % + (target_start, target_end, host_addr.cast(UINTPTR_T), + message, len(guest_phys_blocks))) + + return guest_phys_blocks + + # The leading docstring doesn't have idiomatic Python formatting. It is # printed by gdb's "help" command (the first line is printed in the # "help data" summary), and it should match how other help texts look in # gdb. - -import struct - class DumpGuestMemory(gdb.Command): """Extract guest vmcore from qemu process coredump. -The sole argument is FILE, identifying the target file to write the -guest vmcore to. +The two required arguments are FILE and ARCH: +FILE identifies the target file to write the guest vmcore to. +ARCH specifies the architecture for which the core will be generated. This GDB command reimplements the dump-guest-memory QMP command in python, using the representation of guest memory as captured in the qemu coredump. The qemu process that has been dumped must have had the -command line option "-machine dump-guest-core=on". +command line option "-machine dump-guest-core=on" which is the default. For simplicity, the "paging", "begin" and "end" parameters of the QMP command are not supported -- no attempt is made to get the guest's internal paging structures (ie. paging=false is hard-wired), and guest memory is always fully dumped. -Only x86_64 guests are supported. +Currently aarch64-be, aarch64-le, X86_64, 386, s390, ppc64-be, +ppc64-le guests are supported. The CORE/NT_PRSTATUS and QEMU notes (that is, the VCPUs' statuses) are not written to the vmcore. Preparing these would require context that is @@ -47,293 +464,66 @@ deliberately called abort(), or it was dumped in response to a signal at a halfway fortunate point, then its coredump should be in reasonable shape and this command should mostly work.""" - TARGET_PAGE_SIZE = 0x1000 - TARGET_PAGE_MASK = 0xFFFFFFFFFFFFF000 - - # Various ELF constants - EM_X86_64 = 62 # AMD x86-64 target machine - ELFDATA2LSB = 1 # little endian - ELFCLASS64 = 2 - ELFMAG = "\x7FELF" - EV_CURRENT = 1 - ET_CORE = 4 - PT_LOAD = 1 - PT_NOTE = 4 - - # Special value for e_phnum. This indicates that the real number of - # program headers is too large to fit into e_phnum. Instead the real - # value is in the field sh_info of section 0. - PN_XNUM = 0xFFFF - - # Format strings for packing and header size calculation. - ELF64_EHDR = ("4s" # e_ident/magic - "B" # e_ident/class - "B" # e_ident/data - "B" # e_ident/version - "B" # e_ident/osabi - "8s" # e_ident/pad - "H" # e_type - "H" # e_machine - "I" # e_version - "Q" # e_entry - "Q" # e_phoff - "Q" # e_shoff - "I" # e_flags - "H" # e_ehsize - "H" # e_phentsize - "H" # e_phnum - "H" # e_shentsize - "H" # e_shnum - "H" # e_shstrndx - ) - ELF64_PHDR = ("I" # p_type - "I" # p_flags - "Q" # p_offset - "Q" # p_vaddr - "Q" # p_paddr - "Q" # p_filesz - "Q" # p_memsz - "Q" # p_align - ) - def __init__(self): super(DumpGuestMemory, self).__init__("dump-guest-memory", gdb.COMMAND_DATA, gdb.COMPLETE_FILENAME) - self.uintptr_t = gdb.lookup_type("uintptr_t") - self.elf64_ehdr_le = struct.Struct("<%s" % self.ELF64_EHDR) - self.elf64_phdr_le = struct.Struct("<%s" % self.ELF64_PHDR) - - def int128_get64(self, val): - assert (val["hi"] == 0) - return val["lo"] - - def qlist_foreach(self, head, field_str): - var_p = head["lh_first"] - while (var_p != 0): - var = var_p.dereference() - yield var - var_p = var[field_str]["le_next"] - - def qemu_get_ram_block(self, ram_addr): - ram_blocks = gdb.parse_and_eval("ram_list.blocks") - for block in self.qlist_foreach(ram_blocks, "next"): - if (ram_addr - block["offset"] < block["used_length"]): - return block - raise gdb.GdbError("Bad ram offset %x" % ram_addr) - - def qemu_get_ram_ptr(self, ram_addr): - block = self.qemu_get_ram_block(ram_addr) - return block["host"] + (ram_addr - block["offset"]) - - def memory_region_get_ram_ptr(self, mr): - if (mr["alias"] != 0): - return (self.memory_region_get_ram_ptr(mr["alias"].dereference()) + - mr["alias_offset"]) - return self.qemu_get_ram_ptr(mr["ram_addr"] & self.TARGET_PAGE_MASK) - - def guest_phys_blocks_init(self): - self.guest_phys_blocks = [] - - def guest_phys_blocks_append(self): - print "guest RAM blocks:" - print ("target_start target_end host_addr message " - "count") - print ("---------------- ---------------- ---------------- ------- " - "-----") - - current_map_p = gdb.parse_and_eval("address_space_memory.current_map") - current_map = current_map_p.dereference() - for cur in range(current_map["nr"]): - flat_range = (current_map["ranges"] + cur).dereference() - mr = flat_range["mr"].dereference() - - # we only care about RAM - if (not mr["ram"]): - continue - - section_size = self.int128_get64(flat_range["addr"]["size"]) - target_start = self.int128_get64(flat_range["addr"]["start"]) - target_end = target_start + section_size - host_addr = (self.memory_region_get_ram_ptr(mr) + - flat_range["offset_in_region"]) - predecessor = None - - # find continuity in guest physical address space - if (len(self.guest_phys_blocks) > 0): - predecessor = self.guest_phys_blocks[-1] - predecessor_size = (predecessor["target_end"] - - predecessor["target_start"]) - - # the memory API guarantees monotonically increasing - # traversal - assert (predecessor["target_end"] <= target_start) - - # we want continuity in both guest-physical and - # host-virtual memory - if (predecessor["target_end"] < target_start or - predecessor["host_addr"] + predecessor_size != host_addr): - predecessor = None - - if (predecessor is None): - # isolated mapping, add it to the list - self.guest_phys_blocks.append({"target_start": target_start, - "target_end" : target_end, - "host_addr" : host_addr}) - message = "added" - else: - # expand predecessor until @target_end; predecessor's - # start doesn't change - predecessor["target_end"] = target_end - message = "joined" - - print ("%016x %016x %016x %-7s %5u" % - (target_start, target_end, host_addr.cast(self.uintptr_t), - message, len(self.guest_phys_blocks))) - - def cpu_get_dump_info(self): - # We can't synchronize the registers with KVM post-mortem, and - # the bits in (first_x86_cpu->env.hflags) seem to be stale; they - # may not reflect long mode for example. Hence just assume the - # most common values. This also means that instruction pointer - # etc. will be bogus in the dump, but at least the RAM contents - # should be valid. - self.dump_info = {"d_machine": self.EM_X86_64, - "d_endian" : self.ELFDATA2LSB, - "d_class" : self.ELFCLASS64} - - def encode_elf64_ehdr_le(self): - return self.elf64_ehdr_le.pack( - self.ELFMAG, # e_ident/magic - self.dump_info["d_class"], # e_ident/class - self.dump_info["d_endian"], # e_ident/data - self.EV_CURRENT, # e_ident/version - 0, # e_ident/osabi - "", # e_ident/pad - self.ET_CORE, # e_type - self.dump_info["d_machine"], # e_machine - self.EV_CURRENT, # e_version - 0, # e_entry - self.elf64_ehdr_le.size, # e_phoff - 0, # e_shoff - 0, # e_flags - self.elf64_ehdr_le.size, # e_ehsize - self.elf64_phdr_le.size, # e_phentsize - self.phdr_num, # e_phnum - 0, # e_shentsize - 0, # e_shnum - 0 # e_shstrndx - ) - - def encode_elf64_note_le(self): - return self.elf64_phdr_le.pack(self.PT_NOTE, # p_type - 0, # p_flags - (self.memory_offset - - len(self.note)), # p_offset - 0, # p_vaddr - 0, # p_paddr - len(self.note), # p_filesz - len(self.note), # p_memsz - 0 # p_align - ) - - def encode_elf64_load_le(self, offset, start_hwaddr, range_size): - return self.elf64_phdr_le.pack(self.PT_LOAD, # p_type - 0, # p_flags - offset, # p_offset - 0, # p_vaddr - start_hwaddr, # p_paddr - range_size, # p_filesz - range_size, # p_memsz - 0 # p_align - ) - - def note_init(self, name, desc, type): - # name must include a trailing NUL - namesz = (len(name) + 1 + 3) / 4 * 4 - descsz = (len(desc) + 3) / 4 * 4 - fmt = ("<" # little endian - "I" # n_namesz - "I" # n_descsz - "I" # n_type - "%us" # name - "%us" # desc - % (namesz, descsz)) - self.note = struct.pack(fmt, - len(name) + 1, len(desc), type, name, desc) - - def dump_init(self): - self.guest_phys_blocks_init() - self.guest_phys_blocks_append() - self.cpu_get_dump_info() - # we have no way to retrieve the VCPU status from KVM - # post-mortem - self.note_init("NONE", "EMPTY", 0) - - # Account for PT_NOTE. - self.phdr_num = 1 - - # We should never reach PN_XNUM for paging=false dumps: there's - # just a handful of discontiguous ranges after merging. - self.phdr_num += len(self.guest_phys_blocks) - assert (self.phdr_num < self.PN_XNUM) - - # Calculate the ELF file offset where the memory dump commences: - # - # ELF header - # PT_NOTE - # PT_LOAD: 1 - # PT_LOAD: 2 - # ... - # PT_LOAD: len(self.guest_phys_blocks) - # ELF note - # memory dump - self.memory_offset = (self.elf64_ehdr_le.size + - self.elf64_phdr_le.size * self.phdr_num + - len(self.note)) - - def dump_begin(self, vmcore): - vmcore.write(self.encode_elf64_ehdr_le()) - vmcore.write(self.encode_elf64_note_le()) - running = self.memory_offset + self.elf = None + self.guest_phys_blocks = None + + def dump_init(self, vmcore): + """Prepares and writes ELF structures to core file.""" + + # Needed to make crash happy, data for more useful notes is + # not available in a qemu core. + self.elf.add_note("NONE", "EMPTY", 0) + + # We should never reach PN_XNUM for paging=false dumps, + # there's just a handful of discontiguous ranges after + # merging. + # The constant is needed to account for the PT_NOTE segment. + phdr_num = len(self.guest_phys_blocks) + 1 + assert phdr_num < PN_XNUM + for block in self.guest_phys_blocks: - range_size = block["target_end"] - block["target_start"] - vmcore.write(self.encode_elf64_load_le(running, - block["target_start"], - range_size)) - running += range_size - vmcore.write(self.note) + block_size = block["target_end"] - block["target_start"] + self.elf.add_segment(PT_LOAD, block["target_start"], block_size) + + self.elf.to_file(vmcore) def dump_iterate(self, vmcore): + """Writes guest core to file.""" + qemu_core = gdb.inferiors()[0] for block in self.guest_phys_blocks: - cur = block["host_addr"] + cur = block["host_addr"] left = block["target_end"] - block["target_start"] - print ("dumping range at %016x for length %016x" % - (cur.cast(self.uintptr_t), left)) - while (left > 0): - chunk_size = min(self.TARGET_PAGE_SIZE, left) + print("dumping range at %016x for length %016x" % + (cur.cast(UINTPTR_T), left)) + + while left > 0: + chunk_size = min(TARGET_PAGE_SIZE, left) chunk = qemu_core.read_memory(cur, chunk_size) vmcore.write(chunk) - cur += chunk_size + cur += chunk_size left -= chunk_size - def create_vmcore(self, filename): - vmcore = open(filename, "wb") - self.dump_begin(vmcore) - self.dump_iterate(vmcore) - vmcore.close() - def invoke(self, args, from_tty): + """Handles command invocation from gdb.""" + # Unwittingly pressing the Enter key after the command should # not dump the same multi-gig coredump to the same file. self.dont_repeat() argv = gdb.string_to_argv(args) - if (len(argv) != 1): - raise gdb.GdbError("usage: dump-guest-memory FILE") + if len(argv) != 2: + raise gdb.GdbError("usage: dump-guest-memory FILE ARCH") + + self.elf = ELF(argv[1]) + self.guest_phys_blocks = get_guest_phys_blocks() - self.dump_init() - self.create_vmcore(argv[0]) + with open(argv[0], "wb") as vmcore: + self.dump_init(vmcore) + self.dump_iterate(vmcore) DumpGuestMemory() diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat index 7e5d25612b..d43e8f3e85 100755 --- a/scripts/kvm/kvm_stat +++ b/scripts/kvm/kvm_stat @@ -12,285 +12,311 @@ # the COPYING file in the top-level directory. import curses -import sys, os, time, optparse, ctypes -from ctypes import * - -class DebugfsProvider(object): - def __init__(self): - self.base = '/sys/kernel/debug/kvm' - self._fields = os.listdir(self.base) - def fields(self): - return self._fields - def select(self, fields): - self._fields = fields - def read(self): - def val(key): - return int(file(self.base + '/' + key).read()) - return dict([(key, val(key)) for key in self._fields]) - -vmx_exit_reasons = { - 0: 'EXCEPTION_NMI', - 1: 'EXTERNAL_INTERRUPT', - 2: 'TRIPLE_FAULT', - 7: 'PENDING_INTERRUPT', - 8: 'NMI_WINDOW', - 9: 'TASK_SWITCH', - 10: 'CPUID', - 12: 'HLT', - 14: 'INVLPG', - 15: 'RDPMC', - 16: 'RDTSC', - 18: 'VMCALL', - 19: 'VMCLEAR', - 20: 'VMLAUNCH', - 21: 'VMPTRLD', - 22: 'VMPTRST', - 23: 'VMREAD', - 24: 'VMRESUME', - 25: 'VMWRITE', - 26: 'VMOFF', - 27: 'VMON', - 28: 'CR_ACCESS', - 29: 'DR_ACCESS', - 30: 'IO_INSTRUCTION', - 31: 'MSR_READ', - 32: 'MSR_WRITE', - 33: 'INVALID_STATE', - 36: 'MWAIT_INSTRUCTION', - 39: 'MONITOR_INSTRUCTION', - 40: 'PAUSE_INSTRUCTION', - 41: 'MCE_DURING_VMENTRY', - 43: 'TPR_BELOW_THRESHOLD', - 44: 'APIC_ACCESS', - 48: 'EPT_VIOLATION', - 49: 'EPT_MISCONFIG', - 54: 'WBINVD', - 55: 'XSETBV', - 56: 'APIC_WRITE', - 58: 'INVPCID', +import sys +import os +import time +import optparse +import ctypes +import fcntl +import resource +import struct +import re +from collections import defaultdict + +VMX_EXIT_REASONS = { + 'EXCEPTION_NMI': 0, + 'EXTERNAL_INTERRUPT': 1, + 'TRIPLE_FAULT': 2, + 'PENDING_INTERRUPT': 7, + 'NMI_WINDOW': 8, + 'TASK_SWITCH': 9, + 'CPUID': 10, + 'HLT': 12, + 'INVLPG': 14, + 'RDPMC': 15, + 'RDTSC': 16, + 'VMCALL': 18, + 'VMCLEAR': 19, + 'VMLAUNCH': 20, + 'VMPTRLD': 21, + 'VMPTRST': 22, + 'VMREAD': 23, + 'VMRESUME': 24, + 'VMWRITE': 25, + 'VMOFF': 26, + 'VMON': 27, + 'CR_ACCESS': 28, + 'DR_ACCESS': 29, + 'IO_INSTRUCTION': 30, + 'MSR_READ': 31, + 'MSR_WRITE': 32, + 'INVALID_STATE': 33, + 'MWAIT_INSTRUCTION': 36, + 'MONITOR_INSTRUCTION': 39, + 'PAUSE_INSTRUCTION': 40, + 'MCE_DURING_VMENTRY': 41, + 'TPR_BELOW_THRESHOLD': 43, + 'APIC_ACCESS': 44, + 'EPT_VIOLATION': 48, + 'EPT_MISCONFIG': 49, + 'WBINVD': 54, + 'XSETBV': 55, + 'APIC_WRITE': 56, + 'INVPCID': 58, } -svm_exit_reasons = { - 0x000: 'READ_CR0', - 0x003: 'READ_CR3', - 0x004: 'READ_CR4', - 0x008: 'READ_CR8', - 0x010: 'WRITE_CR0', - 0x013: 'WRITE_CR3', - 0x014: 'WRITE_CR4', - 0x018: 'WRITE_CR8', - 0x020: 'READ_DR0', - 0x021: 'READ_DR1', - 0x022: 'READ_DR2', - 0x023: 'READ_DR3', - 0x024: 'READ_DR4', - 0x025: 'READ_DR5', - 0x026: 'READ_DR6', - 0x027: 'READ_DR7', - 0x030: 'WRITE_DR0', - 0x031: 'WRITE_DR1', - 0x032: 'WRITE_DR2', - 0x033: 'WRITE_DR3', - 0x034: 'WRITE_DR4', - 0x035: 'WRITE_DR5', - 0x036: 'WRITE_DR6', - 0x037: 'WRITE_DR7', - 0x040: 'EXCP_BASE', - 0x060: 'INTR', - 0x061: 'NMI', - 0x062: 'SMI', - 0x063: 'INIT', - 0x064: 'VINTR', - 0x065: 'CR0_SEL_WRITE', - 0x066: 'IDTR_READ', - 0x067: 'GDTR_READ', - 0x068: 'LDTR_READ', - 0x069: 'TR_READ', - 0x06a: 'IDTR_WRITE', - 0x06b: 'GDTR_WRITE', - 0x06c: 'LDTR_WRITE', - 0x06d: 'TR_WRITE', - 0x06e: 'RDTSC', - 0x06f: 'RDPMC', - 0x070: 'PUSHF', - 0x071: 'POPF', - 0x072: 'CPUID', - 0x073: 'RSM', - 0x074: 'IRET', - 0x075: 'SWINT', - 0x076: 'INVD', - 0x077: 'PAUSE', - 0x078: 'HLT', - 0x079: 'INVLPG', - 0x07a: 'INVLPGA', - 0x07b: 'IOIO', - 0x07c: 'MSR', - 0x07d: 'TASK_SWITCH', - 0x07e: 'FERR_FREEZE', - 0x07f: 'SHUTDOWN', - 0x080: 'VMRUN', - 0x081: 'VMMCALL', - 0x082: 'VMLOAD', - 0x083: 'VMSAVE', - 0x084: 'STGI', - 0x085: 'CLGI', - 0x086: 'SKINIT', - 0x087: 'RDTSCP', - 0x088: 'ICEBP', - 0x089: 'WBINVD', - 0x08a: 'MONITOR', - 0x08b: 'MWAIT', - 0x08c: 'MWAIT_COND', - 0x08d: 'XSETBV', - 0x400: 'NPF', +SVM_EXIT_REASONS = { + 'READ_CR0': 0x000, + 'READ_CR3': 0x003, + 'READ_CR4': 0x004, + 'READ_CR8': 0x008, + 'WRITE_CR0': 0x010, + 'WRITE_CR3': 0x013, + 'WRITE_CR4': 0x014, + 'WRITE_CR8': 0x018, + 'READ_DR0': 0x020, + 'READ_DR1': 0x021, + 'READ_DR2': 0x022, + 'READ_DR3': 0x023, + 'READ_DR4': 0x024, + 'READ_DR5': 0x025, + 'READ_DR6': 0x026, + 'READ_DR7': 0x027, + 'WRITE_DR0': 0x030, + 'WRITE_DR1': 0x031, + 'WRITE_DR2': 0x032, + 'WRITE_DR3': 0x033, + 'WRITE_DR4': 0x034, + 'WRITE_DR5': 0x035, + 'WRITE_DR6': 0x036, + 'WRITE_DR7': 0x037, + 'EXCP_BASE': 0x040, + 'INTR': 0x060, + 'NMI': 0x061, + 'SMI': 0x062, + 'INIT': 0x063, + 'VINTR': 0x064, + 'CR0_SEL_WRITE': 0x065, + 'IDTR_READ': 0x066, + 'GDTR_READ': 0x067, + 'LDTR_READ': 0x068, + 'TR_READ': 0x069, + 'IDTR_WRITE': 0x06a, + 'GDTR_WRITE': 0x06b, + 'LDTR_WRITE': 0x06c, + 'TR_WRITE': 0x06d, + 'RDTSC': 0x06e, + 'RDPMC': 0x06f, + 'PUSHF': 0x070, + 'POPF': 0x071, + 'CPUID': 0x072, + 'RSM': 0x073, + 'IRET': 0x074, + 'SWINT': 0x075, + 'INVD': 0x076, + 'PAUSE': 0x077, + 'HLT': 0x078, + 'INVLPG': 0x079, + 'INVLPGA': 0x07a, + 'IOIO': 0x07b, + 'MSR': 0x07c, + 'TASK_SWITCH': 0x07d, + 'FERR_FREEZE': 0x07e, + 'SHUTDOWN': 0x07f, + 'VMRUN': 0x080, + 'VMMCALL': 0x081, + 'VMLOAD': 0x082, + 'VMSAVE': 0x083, + 'STGI': 0x084, + 'CLGI': 0x085, + 'SKINIT': 0x086, + 'RDTSCP': 0x087, + 'ICEBP': 0x088, + 'WBINVD': 0x089, + 'MONITOR': 0x08a, + 'MWAIT': 0x08b, + 'MWAIT_COND': 0x08c, + 'XSETBV': 0x08d, + 'NPF': 0x400, } # EC definition of HSR (from arch/arm64/include/asm/kvm_arm.h) -aarch64_exit_reasons = { - 0x00: 'UNKNOWN', - 0x01: 'WFI', - 0x03: 'CP15_32', - 0x04: 'CP15_64', - 0x05: 'CP14_MR', - 0x06: 'CP14_LS', - 0x07: 'FP_ASIMD', - 0x08: 'CP10_ID', - 0x0C: 'CP14_64', - 0x0E: 'ILL_ISS', - 0x11: 'SVC32', - 0x12: 'HVC32', - 0x13: 'SMC32', - 0x15: 'SVC64', - 0x16: 'HVC64', - 0x17: 'SMC64', - 0x18: 'SYS64', - 0x20: 'IABT', - 0x21: 'IABT_HYP', - 0x22: 'PC_ALIGN', - 0x24: 'DABT', - 0x25: 'DABT_HYP', - 0x26: 'SP_ALIGN', - 0x28: 'FP_EXC32', - 0x2C: 'FP_EXC64', - 0x2F: 'SERROR', - 0x30: 'BREAKPT', - 0x31: 'BREAKPT_HYP', - 0x32: 'SOFTSTP', - 0x33: 'SOFTSTP_HYP', - 0x34: 'WATCHPT', - 0x35: 'WATCHPT_HYP', - 0x38: 'BKPT32', - 0x3A: 'VECTOR32', - 0x3C: 'BRK64', +AARCH64_EXIT_REASONS = { + 'UNKNOWN': 0x00, + 'WFI': 0x01, + 'CP15_32': 0x03, + 'CP15_64': 0x04, + 'CP14_MR': 0x05, + 'CP14_LS': 0x06, + 'FP_ASIMD': 0x07, + 'CP10_ID': 0x08, + 'CP14_64': 0x0C, + 'ILL_ISS': 0x0E, + 'SVC32': 0x11, + 'HVC32': 0x12, + 'SMC32': 0x13, + 'SVC64': 0x15, + 'HVC64': 0x16, + 'SMC64': 0x17, + 'SYS64': 0x18, + 'IABT': 0x20, + 'IABT_HYP': 0x21, + 'PC_ALIGN': 0x22, + 'DABT': 0x24, + 'DABT_HYP': 0x25, + 'SP_ALIGN': 0x26, + 'FP_EXC32': 0x28, + 'FP_EXC64': 0x2C, + 'SERROR': 0x2F, + 'BREAKPT': 0x30, + 'BREAKPT_HYP': 0x31, + 'SOFTSTP': 0x32, + 'SOFTSTP_HYP': 0x33, + 'WATCHPT': 0x34, + 'WATCHPT_HYP': 0x35, + 'BKPT32': 0x38, + 'VECTOR32': 0x3A, + 'BRK64': 0x3C, } # From include/uapi/linux/kvm.h, KVM_EXIT_xxx -userspace_exit_reasons = { - 0: 'UNKNOWN', - 1: 'EXCEPTION', - 2: 'IO', - 3: 'HYPERCALL', - 4: 'DEBUG', - 5: 'HLT', - 6: 'MMIO', - 7: 'IRQ_WINDOW_OPEN', - 8: 'SHUTDOWN', - 9: 'FAIL_ENTRY', - 10: 'INTR', - 11: 'SET_TPR', - 12: 'TPR_ACCESS', - 13: 'S390_SIEIC', - 14: 'S390_RESET', - 15: 'DCR', - 16: 'NMI', - 17: 'INTERNAL_ERROR', - 18: 'OSI', - 19: 'PAPR_HCALL', - 20: 'S390_UCONTROL', - 21: 'WATCHDOG', - 22: 'S390_TSCH', - 23: 'EPR', - 24: 'SYSTEM_EVENT', +USERSPACE_EXIT_REASONS = { + 'UNKNOWN': 0, + 'EXCEPTION': 1, + 'IO': 2, + 'HYPERCALL': 3, + 'DEBUG': 4, + 'HLT': 5, + 'MMIO': 6, + 'IRQ_WINDOW_OPEN': 7, + 'SHUTDOWN': 8, + 'FAIL_ENTRY': 9, + 'INTR': 10, + 'SET_TPR': 11, + 'TPR_ACCESS': 12, + 'S390_SIEIC': 13, + 'S390_RESET': 14, + 'DCR': 15, + 'NMI': 16, + 'INTERNAL_ERROR': 17, + 'OSI': 18, + 'PAPR_HCALL': 19, + 'S390_UCONTROL': 20, + 'WATCHDOG': 21, + 'S390_TSCH': 22, + 'EPR': 23, + 'SYSTEM_EVENT': 24, } -x86_exit_reasons = { - 'vmx': vmx_exit_reasons, - 'svm': svm_exit_reasons, +IOCTL_NUMBERS = { + 'SET_FILTER': 0x40082406, + 'ENABLE': 0x00002400, + 'DISABLE': 0x00002401, + 'RESET': 0x00002403, } -sc_perf_evt_open = None -exit_reasons = None +class Arch(object): + """Class that encapsulates global architecture specific data like + syscall and ioctl numbers. + + """ + @staticmethod + def get_arch(): + machine = os.uname()[4] + + if machine.startswith('ppc'): + return ArchPPC() + elif machine.startswith('aarch64'): + return ArchA64() + elif machine.startswith('s390'): + return ArchS390() + else: + # X86_64 + for line in open('/proc/cpuinfo'): + if not line.startswith('flags'): + continue + + flags = line.split() + if 'vmx' in flags: + return ArchX86(VMX_EXIT_REASONS) + if 'svm' in flags: + return ArchX86(SVM_EXIT_REASONS) + return + +class ArchX86(Arch): + def __init__(self, exit_reasons): + self.sc_perf_evt_open = 298 + self.ioctl_numbers = IOCTL_NUMBERS + self.exit_reasons = exit_reasons + +class ArchPPC(Arch): + def __init__(self): + self.sc_perf_evt_open = 319 + self.ioctl_numbers = IOCTL_NUMBERS + self.ioctl_numbers['ENABLE'] = 0x20002400 + self.ioctl_numbers['DISABLE'] = 0x20002401 -ioctl_numbers = { - 'SET_FILTER' : 0x40082406, - 'ENABLE' : 0x00002400, - 'DISABLE' : 0x00002401, - 'RESET' : 0x00002403, -} + # PPC comes in 32 and 64 bit and some generated ioctl + # numbers depend on the wordsize. + char_ptr_size = ctypes.sizeof(ctypes.c_char_p) + self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16 + +class ArchA64(Arch): + def __init__(self): + self.sc_perf_evt_open = 241 + self.ioctl_numbers = IOCTL_NUMBERS + self.exit_reasons = AARCH64_EXIT_REASONS + +class ArchS390(Arch): + def __init__(self): + self.sc_perf_evt_open = 331 + self.ioctl_numbers = IOCTL_NUMBERS + self.exit_reasons = None + +ARCH = Arch.get_arch() + + +def walkdir(path): + """Returns os.walk() data for specified directory. + + As it is only a wrapper it returns the same 3-tuple of (dirpath, + dirnames, filenames). + """ + return next(os.walk(path)) + + +def parse_int_list(list_string): + """Returns an int list from a string of comma separated integers and + integer ranges.""" + integers = [] + members = list_string.split(',') -def x86_init(flag): - globals().update({ - 'sc_perf_evt_open' : 298, - 'exit_reasons' : x86_exit_reasons[flag], - }) - -def s390_init(): - globals().update({ - 'sc_perf_evt_open' : 331 - }) - -def ppc_init(): - globals().update({ - 'sc_perf_evt_open' : 319, - 'ioctl_numbers' : { - 'SET_FILTER' : 0x80002406 | (ctypes.sizeof(ctypes.c_char_p) << 16), - 'ENABLE' : 0x20002400, - 'DISABLE' : 0x20002401, - } - }) - -def aarch64_init(): - globals().update({ - 'sc_perf_evt_open' : 241, - 'exit_reasons' : aarch64_exit_reasons, - }) - -def detect_platform(): - if os.uname()[4].startswith('ppc'): - ppc_init() - return - elif os.uname()[4].startswith('aarch64'): - aarch64_init() - return - - for line in file('/proc/cpuinfo').readlines(): - if line.startswith('flags'): - for flag in line.split(): - if flag in x86_exit_reasons: - x86_init(flag) - return - elif line.startswith('vendor_id'): - for flag in line.split(): - if flag == 'IBM/S390': - s390_init() - return - -detect_platform() - -def invert(d): - return dict((x[1], x[0]) for x in d.iteritems()) - -filters = {} -filters['kvm_userspace_exit'] = ('reason', invert(userspace_exit_reasons)) -if exit_reasons: - filters['kvm_exit'] = ('exit_reason', invert(exit_reasons)) - -import struct, array - -libc = ctypes.CDLL('libc.so.6') + for member in members: + if '-' not in member: + integers.append(int(member)) + else: + int_range = member.split('-') + integers.extend(range(int(int_range[0]), + int(int_range[1]) + 1)) + + return integers + + +def get_online_cpus(): + with open('/sys/devices/system/cpu/online') as cpu_list: + cpu_string = cpu_list.readline() + return parse_int_list(cpu_string) + + +def get_filters(): + filters = {} + filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS) + if ARCH.exit_reasons: + filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons) + return filters + +libc = ctypes.CDLL('libc.so.6', use_errno=True) syscall = libc.syscall -get_errno = libc.__errno_location -get_errno.restype = POINTER(c_int) class perf_event_attr(ctypes.Structure): _fields_ = [('type', ctypes.c_uint32), @@ -305,262 +331,350 @@ class perf_event_attr(ctypes.Structure): ('bp_addr', ctypes.c_uint64), ('bp_len', ctypes.c_uint64), ] -def _perf_event_open(attr, pid, cpu, group_fd, flags): - return syscall(sc_perf_evt_open, ctypes.pointer(attr), ctypes.c_int(pid), - ctypes.c_int(cpu), ctypes.c_int(group_fd), - ctypes.c_long(flags)) - -PERF_TYPE_HARDWARE = 0 -PERF_TYPE_SOFTWARE = 1 -PERF_TYPE_TRACEPOINT = 2 -PERF_TYPE_HW_CACHE = 3 -PERF_TYPE_RAW = 4 -PERF_TYPE_BREAKPOINT = 5 - -PERF_SAMPLE_IP = 1 << 0 -PERF_SAMPLE_TID = 1 << 1 -PERF_SAMPLE_TIME = 1 << 2 -PERF_SAMPLE_ADDR = 1 << 3 -PERF_SAMPLE_READ = 1 << 4 -PERF_SAMPLE_CALLCHAIN = 1 << 5 -PERF_SAMPLE_ID = 1 << 6 -PERF_SAMPLE_CPU = 1 << 7 -PERF_SAMPLE_PERIOD = 1 << 8 -PERF_SAMPLE_STREAM_ID = 1 << 9 -PERF_SAMPLE_RAW = 1 << 10 - -PERF_FORMAT_TOTAL_TIME_ENABLED = 1 << 0 -PERF_FORMAT_TOTAL_TIME_RUNNING = 1 << 1 -PERF_FORMAT_ID = 1 << 2 -PERF_FORMAT_GROUP = 1 << 3 -import re + def __init__(self): + super(self.__class__, self).__init__() + self.type = PERF_TYPE_TRACEPOINT + self.size = ctypes.sizeof(self) + self.read_format = PERF_FORMAT_GROUP + +def perf_event_open(attr, pid, cpu, group_fd, flags): + return syscall(ARCH.sc_perf_evt_open, ctypes.pointer(attr), + ctypes.c_int(pid), ctypes.c_int(cpu), + ctypes.c_int(group_fd), ctypes.c_long(flags)) -sys_tracing = '/sys/kernel/debug/tracing' +PERF_TYPE_TRACEPOINT = 2 +PERF_FORMAT_GROUP = 1 << 3 + +PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing' +PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm' class Group(object): - def __init__(self, cpu): + def __init__(self): self.events = [] - self.group_leader = None - self.cpu = cpu - def add_event(self, name, event_set, tracepoint, filter = None): - self.events.append(Event(group = self, - name = name, event_set = event_set, - tracepoint = tracepoint, filter = filter)) - if len(self.events) == 1: - self.file = os.fdopen(self.events[0].fd) + + def add_event(self, event): + self.events.append(event) + def read(self): - bytes = 8 * (1 + len(self.events)) - fmt = 'xxxxxxxx' + 'q' * len(self.events) + length = 8 * (1 + len(self.events)) + read_format = 'xxxxxxxx' + 'Q' * len(self.events) return dict(zip([event.name for event in self.events], - struct.unpack(fmt, self.file.read(bytes)))) + struct.unpack(read_format, + os.read(self.events[0].fd, length)))) class Event(object): - def __init__(self, group, name, event_set, tracepoint, filter = None): + def __init__(self, name, group, trace_cpu, trace_point, trace_filter, + trace_set='kvm'): self.name = name - attr = perf_event_attr() - attr.type = PERF_TYPE_TRACEPOINT - attr.size = ctypes.sizeof(attr) - id_path = os.path.join(sys_tracing, 'events', event_set, - tracepoint, 'id') - id = int(file(id_path).read()) - attr.config = id - attr.sample_type = (PERF_SAMPLE_RAW - | PERF_SAMPLE_TIME - | PERF_SAMPLE_CPU) - attr.sample_period = 1 - attr.read_format = PERF_FORMAT_GROUP + self.fd = None + self.setup_event(group, trace_cpu, trace_point, trace_filter, + trace_set) + + def setup_event_attribute(self, trace_set, trace_point): + id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set, + trace_point, 'id') + + event_attr = perf_event_attr() + event_attr.config = int(open(id_path).read()) + return event_attr + + def setup_event(self, group, trace_cpu, trace_point, trace_filter, + trace_set): + event_attr = self.setup_event_attribute(trace_set, trace_point) + group_leader = -1 if group.events: group_leader = group.events[0].fd - fd = _perf_event_open(attr, -1, group.cpu, group_leader, 0) + + fd = perf_event_open(event_attr, -1, trace_cpu, + group_leader, 0) if fd == -1: - err = get_errno()[0] - raise Exception('perf_event_open failed, errno = ' + err.__str__()) - if filter: - import fcntl - fcntl.ioctl(fd, ioctl_numbers['SET_FILTER'], filter) + err = ctypes.get_errno() + raise OSError(err, os.strerror(err), + 'while calling sys_perf_event_open().') + + if trace_filter: + fcntl.ioctl(fd, ARCH.ioctl_numbers['SET_FILTER'], + trace_filter) + self.fd = fd + def enable(self): - import fcntl - fcntl.ioctl(self.fd, ioctl_numbers['ENABLE'], 0) + fcntl.ioctl(self.fd, ARCH.ioctl_numbers['ENABLE'], 0) + def disable(self): - import fcntl - fcntl.ioctl(self.fd, ioctl_numbers['DISABLE'], 0) + fcntl.ioctl(self.fd, ARCH.ioctl_numbers['DISABLE'], 0) + def reset(self): - import fcntl - fcntl.ioctl(self.fd, ioctl_numbers['RESET'], 0) + fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0) class TracepointProvider(object): def __init__(self): - path = os.path.join(sys_tracing, 'events', 'kvm') - fields = [f - for f in os.listdir(path) - if os.path.isdir(os.path.join(path, f))] + self.group_leaders = [] + self.filters = get_filters() + self._fields = self.get_available_fields() + self.setup_traces() + self.fields = self._fields + + def get_available_fields(self): + path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm') + fields = walkdir(path)[1] extra = [] - for f in fields: - if f in filters: - subfield, values = filters[f] - for name, number in values.iteritems(): - extra.append(f + '(' + name + ')') + for field in fields: + if field in self.filters: + filter_name_, filter_dicts = self.filters[field] + for name in filter_dicts: + extra.append(field + '(' + name + ')') fields += extra - self._setup(fields) - self.select(fields) - def fields(self): - return self._fields + return fields + + def setup_traces(self): + cpus = get_online_cpus() + + # The constant is needed as a buffer for python libs, std + # streams and other files that the script opens. + newlim = len(cpus) * len(self._fields) + 50 + try: + softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE) + + if hardlim < newlim: + # Now we need CAP_SYS_RESOURCE, to increase the hard limit. + resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, newlim)) + else: + # Raising the soft limit is sufficient. + resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, hardlim)) + + except ValueError: + sys.exit("NOFILE rlimit could not be raised to {0}".format(newlim)) - def _online_cpus(self): - l = [] - pattern = r'cpu([0-9]+)' - basedir = '/sys/devices/system/cpu' - for entry in os.listdir(basedir): - match = re.match(pattern, entry) - if not match: - continue - path = os.path.join(basedir, entry, 'online') - if os.path.exists(path) and open(path).read().strip() != '1': - continue - l.append(int(match.group(1))) - return l - - def _setup(self, _fields): - self._fields = _fields - cpus = self._online_cpus() - import resource - nfiles = len(cpus) * 1000 - resource.setrlimit(resource.RLIMIT_NOFILE, (nfiles, nfiles)) - events = [] - self.group_leaders = [] for cpu in cpus: - group = Group(cpu) - for name in _fields: + group = Group() + for name in self._fields: tracepoint = name - filter = None - m = re.match(r'(.*)\((.*)\)', name) - if m: - tracepoint, sub = m.groups() - filter = '%s==%d\0' % (filters[tracepoint][0], - filters[tracepoint][1][sub]) - event = group.add_event(name, event_set = 'kvm', - tracepoint = tracepoint, - filter = filter) + tracefilter = None + match = re.match(r'(.*)\((.*)\)', name) + if match: + tracepoint, sub = match.groups() + tracefilter = ('%s==%d\0' % + (self.filters[tracepoint][0], + self.filters[tracepoint][1][sub])) + + group.add_event(Event(name=name, + group=group, + trace_cpu=cpu, + trace_point=tracepoint, + trace_filter=tracefilter)) self.group_leaders.append(group) - def select(self, fields): + + def available_fields(self): + return self.get_available_fields() + + @property + def fields(self): + return self._fields + + @fields.setter + def fields(self, fields): + self._fields = fields for group in self.group_leaders: - for event in group.events: + for index, event in enumerate(group.events): if event.name in fields: event.reset() event.enable() else: - event.disable() + # Do not disable the group leader. + # It would disable all of its events. + if index != 0: + event.disable() + def read(self): - from collections import defaultdict ret = defaultdict(int) for group in self.group_leaders: for name, val in group.read().iteritems(): - ret[name] += val + if name in self._fields: + ret[name] += val return ret -class Stats: - def __init__(self, providers, fields = None): +class DebugfsProvider(object): + def __init__(self): + self._fields = self.get_available_fields() + + def get_available_fields(self): + return walkdir(PATH_DEBUGFS_KVM)[2] + + @property + def fields(self): + return self._fields + + @fields.setter + def fields(self, fields): + self._fields = fields + + def read(self): + def val(key): + return int(file(PATH_DEBUGFS_KVM + '/' + key).read()) + return dict([(key, val(key)) for key in self._fields]) + +class Stats(object): + def __init__(self, providers, fields=None): self.providers = providers - self.fields_filter = fields - self._update() - def _update(self): + self._fields_filter = fields + self.values = {} + self.update_provider_filters() + + def update_provider_filters(self): def wanted(key): - import re - if not self.fields_filter: + if not self._fields_filter: return True - return re.match(self.fields_filter, key) is not None - self.values = dict() - for d in providers: - provider_fields = [key for key in d.fields() if wanted(key)] - for key in provider_fields: - self.values[key] = None - d.select(provider_fields) - def set_fields_filter(self, fields_filter): - self.fields_filter = fields_filter - self._update() + return re.match(self._fields_filter, key) is not None + + # As we reset the counters when updating the fields we can + # also clear the cache of old values. + self.values = {} + for provider in self.providers: + provider_fields = [key for key in provider.get_available_fields() + if wanted(key)] + provider.fields = provider_fields + + @property + def fields_filter(self): + return self._fields_filter + + @fields_filter.setter + def fields_filter(self, fields_filter): + self._fields_filter = fields_filter + self.update_provider_filters() + def get(self): - for d in providers: - new = d.read() - for key in d.fields(): + for provider in self.providers: + new = provider.read() + for key in provider.fields: oldval = self.values.get(key, (0, 0)) - newval = new[key] + newval = new.get(key, 0) newdelta = None if oldval is not None: newdelta = newval - oldval[0] self.values[key] = (newval, newdelta) return self.values -if not os.access('/sys/kernel/debug', os.F_OK): - print 'Please enable CONFIG_DEBUG_FS in your kernel' - sys.exit(1) -if not os.access('/sys/kernel/debug/kvm', os.F_OK): - print "Please mount debugfs ('mount -t debugfs debugfs /sys/kernel/debug')" - print "and ensure the kvm modules are loaded" - sys.exit(1) - -label_width = 40 -number_width = 10 - -def tui(screen, stats): - curses.use_default_colors() - curses.noecho() - drilldown = False - fields_filter = stats.fields_filter - def update_drilldown(): - if not fields_filter: - if drilldown: - stats.set_fields_filter(None) - else: - stats.set_fields_filter(r'^[^\(]*$') - update_drilldown() - def refresh(sleeptime): - screen.erase() - screen.addstr(0, 0, 'kvm statistics') - screen.addstr(2, 1, 'Event') - screen.addstr(2, 1 + label_width + number_width - len('Total'), 'Total') - screen.addstr(2, 1 + label_width + number_width + 8 - len('Current'), 'Current') +LABEL_WIDTH = 40 +NUMBER_WIDTH = 10 + +class Tui(object): + def __init__(self, stats): + self.stats = stats + self.screen = None + self.drilldown = False + self.update_drilldown() + + def __enter__(self): + """Initialises curses for later use. Based on curses.wrapper + implementation from the Python standard library.""" + self.screen = curses.initscr() + curses.noecho() + curses.cbreak() + + # The try/catch works around a minor bit of + # over-conscientiousness in the curses module, the error + # return from C start_color() is ignorable. + try: + curses.start_color() + except: + pass + + curses.use_default_colors() + return self + + def __exit__(self, *exception): + """Resets the terminal to its normal state. Based on curses.wrappre + implementation from the Python standard library.""" + if self.screen: + self.screen.keypad(0) + curses.echo() + curses.nocbreak() + curses.endwin() + + def update_drilldown(self): + if not self.stats.fields_filter: + self.stats.fields_filter = r'^[^\(]*$' + + elif self.stats.fields_filter == r'^[^\(]*$': + self.stats.fields_filter = None + + def refresh(self, sleeptime): + self.screen.erase() + self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) + self.screen.addstr(2, 1, 'Event') + self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH - + len('Total'), 'Total') + self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 - + len('Current'), 'Current') row = 3 - s = stats.get() + stats = self.stats.get() def sortkey(x): - if s[x][1]: - return (-s[x][1], -s[x][0]) + if stats[x][1]: + return (-stats[x][1], -stats[x][0]) else: - return (0, -s[x][0]) - for key in sorted(s.keys(), key = sortkey): - if row >= screen.getmaxyx()[0]: + return (0, -stats[x][0]) + for key in sorted(stats.keys(), key=sortkey): + + if row >= self.screen.getmaxyx()[0]: break - values = s[key] + values = stats[key] if not values[0] and not values[1]: break col = 1 - screen.addstr(row, col, key) - col += label_width - screen.addstr(row, col, '%10d' % (values[0],)) - col += number_width + self.screen.addstr(row, col, key) + col += LABEL_WIDTH + self.screen.addstr(row, col, '%10d' % (values[0],)) + col += NUMBER_WIDTH if values[1] is not None: - screen.addstr(row, col, '%8d' % (values[1] / sleeptime,)) + self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,)) row += 1 - screen.refresh() + self.screen.refresh() + + def show_filter_selection(self): + while True: + self.screen.erase() + self.screen.addstr(0, 0, + "Show statistics for events matching a regex.", + curses.A_BOLD) + self.screen.addstr(2, 0, + "Current regex: {0}" + .format(self.stats.fields_filter)) + self.screen.addstr(3, 0, "New regex: ") + curses.echo() + regex = self.screen.getstr() + curses.noecho() + if len(regex) == 0: + return + try: + re.compile(regex) + self.stats.fields_filter = regex + return + except re.error: + continue - sleeptime = 0.25 - while True: - refresh(sleeptime) - curses.halfdelay(int(sleeptime * 10)) - sleeptime = 3 - try: - c = screen.getkey() - if c == 'x': - drilldown = not drilldown - update_drilldown() - if c == 'q': + def show_stats(self): + sleeptime = 0.25 + while True: + self.refresh(sleeptime) + curses.halfdelay(int(sleeptime * 10)) + sleeptime = 3 + try: + char = self.screen.getkey() + if char == 'x': + self.drilldown = not self.drilldown + self.update_drilldown() + if char == 'q': + break + if char == 'f': + self.show_filter_selection() + except KeyboardInterrupt: break - except KeyboardInterrupt: - break - except curses.error: - continue + except curses.error: + continue def batch(stats): s = stats.get() @@ -568,13 +682,13 @@ def batch(stats): s = stats.get() for key in sorted(s.keys()): values = s[key] - print '%-22s%10d%10d' % (key, values[0], values[1]) + print '%-42s%10d%10d' % (key, values[0], values[1]) def log(stats): keys = sorted(stats.get().iterkeys()) def banner(): for k in keys: - print '%10s' % k[0:9], + print '%s' % k, print def statline(): s = stats.get() @@ -590,57 +704,110 @@ def log(stats): statline() line += 1 -options = optparse.OptionParser() -options.add_option('-1', '--once', '--batch', - action = 'store_true', - default = False, - dest = 'once', - help = 'run in batch mode for one second', - ) -options.add_option('-l', '--log', - action = 'store_true', - default = False, - dest = 'log', - help = 'run in logging mode (like vmstat)', - ) -options.add_option('-t', '--tracepoints', - action = 'store_true', - default = False, - dest = 'tracepoints', - help = 'retrieve statistics from tracepoints', - ) -options.add_option('-d', '--debugfs', - action = 'store_true', - default = False, - dest = 'debugfs', - help = 'retrieve statistics from debugfs', - ) -options.add_option('-f', '--fields', - action = 'store', - default = None, - dest = 'fields', - help = 'fields to display (regex)', - ) -(options, args) = options.parse_args(sys.argv) - -providers = [] -if options.tracepoints: - providers.append(TracepointProvider()) -if options.debugfs: - providers.append(DebugfsProvider()) - -if len(providers) == 0: - try: - providers = [TracepointProvider()] - except: - providers = [DebugfsProvider()] - -stats = Stats(providers, fields = options.fields) - -if options.log: - log(stats) -elif not options.once: - import curses.wrapper - curses.wrapper(tui, stats) -else: - batch(stats) +def get_options(): + description_text = """ +This script displays various statistics about VMs running under KVM. +The statistics are gathered from the KVM debugfs entries and / or the +currently available perf traces. + +The monitoring takes additional cpu cycles and might affect the VM's +performance. + +Requirements: +- Access to: + /sys/kernel/debug/kvm + /sys/kernel/debug/trace/events/* + /proc/pid/task +- /proc/sys/kernel/perf_event_paranoid < 1 if user has no + CAP_SYS_ADMIN and perf events are used. +- CAP_SYS_RESOURCE if the hard limit is not high enough to allow + the large number of files that are possibly opened. +""" + + class PlainHelpFormatter(optparse.IndentedHelpFormatter): + def format_description(self, description): + if description: + return description + "\n" + else: + return "" + + optparser = optparse.OptionParser(description=description_text, + formatter=PlainHelpFormatter()) + optparser.add_option('-1', '--once', '--batch', + action='store_true', + default=False, + dest='once', + help='run in batch mode for one second', + ) + optparser.add_option('-l', '--log', + action='store_true', + default=False, + dest='log', + help='run in logging mode (like vmstat)', + ) + optparser.add_option('-t', '--tracepoints', + action='store_true', + default=False, + dest='tracepoints', + help='retrieve statistics from tracepoints', + ) + optparser.add_option('-d', '--debugfs', + action='store_true', + default=False, + dest='debugfs', + help='retrieve statistics from debugfs', + ) + optparser.add_option('-f', '--fields', + action='store', + default=None, + dest='fields', + help='fields to display (regex)', + ) + (options, _) = optparser.parse_args(sys.argv) + return options + +def get_providers(options): + providers = [] + + if options.tracepoints: + providers.append(TracepointProvider()) + if options.debugfs: + providers.append(DebugfsProvider()) + if len(providers) == 0: + providers.append(TracepointProvider()) + + return providers + +def check_access(): + if not os.path.exists('/sys/kernel/debug'): + sys.stderr.write('Please enable CONFIG_DEBUG_FS in your kernel.') + sys.exit(1) + + if not os.path.exists(PATH_DEBUGFS_KVM): + sys.stderr.write("Please make sure, that debugfs is mounted and " + "readable by the current user:\n" + "('mount -t debugfs debugfs /sys/kernel/debug')\n" + "Also ensure, that the kvm modules are loaded.\n") + sys.exit(1) + + if not os.path.exists(PATH_DEBUGFS_TRACING): + sys.stderr.write("Please make {0} readable by the current user.\n" + .format(PATH_DEBUGFS_TRACING)) + sys.exit(1) + +def main(): + check_access() + options = get_options() + providers = get_providers(options) + stats = Stats(providers, fields=options.fields) + + if options.log: + log(stats) + elif not options.once: + with Tui(stats) as tui: + tui.show_stats() + else: + batch(stats) + +if __name__ == "__main__": + main() |