libdrgn: aarch64: Rework page table walker to only read one PTE per level

The current page table walker reads, on average, around half of the
entire page table at each level: at each step it fetches every entry
from the target index through the end of the table. This is
inefficient, especially when debugging a remote target that may have a
low-bandwidth connection to the debugger. Address this by reading only
one PTE per level.
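
To make the access pattern concrete, here is a minimal standalone
sketch of a one-PTE-per-level walk. This is not drgn's API:
read_memory, the shift constants, and the 4 KiB-granule address mask
are placeholder assumptions for illustration.

    // Hypothetical sketch: resolve one 8-byte descriptor per level. With a
    // 4 KiB granule there are 512 entries (9 index bits) per table, so a
    // 4-level walk now reads 4 * 8 = 32 bytes instead of up to 4 KiB per
    // level.
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PGTABLE_SHIFT 9 /* log2(512 entries per table) */

    /* Placeholder for the debugger's (possibly remote, slow) memory read. */
    int read_memory(uint64_t addr, void *buf, uint64_t size);

    int walk_one_pte_per_level(uint64_t pgtable, uint64_t virt_addr,
                               int levels, uint64_t *desc_ret)
    {
        uint64_t table = pgtable;
        for (int level = levels; level >= 1; level--) {
            unsigned int shift = PAGE_SHIFT + PGTABLE_SHIFT * (level - 1);
            uint64_t index = (virt_addr >> shift) & ((1 << PGTABLE_SHIFT) - 1);
            uint64_t desc;
            if (read_memory(table + 8 * index, &desc, 8)) /* one PTE */
                return -1;
            /* Bits [1:0] != 0x3 means block or invalid; level 1 is a leaf. */
            if ((desc & 0x3) != 0x3 || level == 1) {
                *desc_ret = desc;
                return 0;
            }
            table = desc & UINT64_C(0x0000fffffffff000); /* bits [47:12] */
        }
        return -1; /* unreachable: level 1 always returns above */
    }

The walker below has the same shape, and additionally caches the
descriptor read at each level so that translating adjacent addresses
does not repeat the upper-level reads; see the note after the diff.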

I've only done the aarch64 page table walker because that's all that I
needed, but in principle the other page table walkers could work in a
similar way.

Signed-off-by: Peter Collingbourne <pcc@google.com>
commit e99921d77b (parent 79a1ea2a33)
Author:       Peter Collingbourne <pcc@google.com>
Date:         2023-06-27 16:54:34 -07:00
Committed by: Omar Sandoval

diff --git a/libdrgn/arch_aarch64.c b/libdrgn/arch_aarch64.c
--- a/libdrgn/arch_aarch64.c
+++ b/libdrgn/arch_aarch64.c

@@ -7,6 +7,7 @@
 #include <stdlib.h>
 #include <string.h>
+#include "array.h"
 #include "error.h"
 #include "platform.h" // IWYU pragma: associated
 #include "program.h"
@@ -269,8 +270,8 @@ struct pgtable_iterator_aarch64 {
         int levels;
         uint16_t entries_per_level;
         uint16_t last_level_num_entries;
-        uint16_t *index;
-        uint64_t *table;
+        uint64_t cached_virt_addr;
+        uint64_t table[5];
         uint64_t pa_low_mask;
         uint64_t pa_high_mask;
 };
@@ -328,19 +329,11 @@ linux_kernel_pgtable_iterator_create_aarch64(struct drgn_program *prog,
         it->levels = ((va_bits - page_shift + pgtable_shift - 1) /
                       pgtable_shift);
+        assert(it->levels <= array_size(it->table));
         it->entries_per_level = 1 << pgtable_shift;
         it->last_level_num_entries =
                 1 << ((va_bits - page_shift - 1) % pgtable_shift + 1);
-        it->index = malloc_array(it->levels, sizeof(it->index[0]));
-        if (!it->index)
-                goto err_it;
-        it->table = malloc_array((size_t)(it->levels - 1) * it->entries_per_level
-                                 + it->last_level_num_entries,
-                                 sizeof(it->table[0]));
-        if (!it->table)
-                goto err_index;
         // Descriptor bits [47:PAGE_SHIFT] contain physical address bits
         // [47:PAGE_SHIFT].
         //
@@ -364,20 +357,12 @@
         *ret = &it->it;
         return NULL;
-
-err_index:
-        free(it->index);
-err_it:
-        free(it);
-        return &drgn_enomem;
 }
 
 static void linux_kernel_pgtable_iterator_destroy_aarch64(struct pgtable_iterator *_it)
 {
         struct pgtable_iterator_aarch64 *it =
                 container_of(_it, struct pgtable_iterator_aarch64, it);
-        free(it->table);
-        free(it->index);
         free(it);
 }
@@ -394,7 +379,9 @@ static void linux_kernel_pgtable_iterator_init_aarch64(struct drgn_program *prog
                 it->va_range_max =
                         (UINT64_C(1) << prog->vmcoreinfo.va_bits) - 1;
         }
-        memset(it->index, 0xff, it->levels * sizeof(it->index[0]));
+        it->cached_virt_addr = 0;
+        memset(it->table, 0, sizeof(it->table));
 }
 
 static struct drgn_error *
@@ -410,71 +397,59 @@ linux_kernel_pgtable_iterator_next_aarch64(struct drgn_program *prog,
         struct pgtable_iterator_aarch64 *it =
                 container_of(_it, struct pgtable_iterator_aarch64, it);
         const uint64_t virt_addr = it->it.virt_addr;
-        int level;
-        // Find the lowest level with cached entries.
-        for (level = 0; level < it->levels - 1; level++) {
-                if (it->index[level] < it->entries_per_level)
-                        break;
+
+        if (virt_addr < it->va_range_min || virt_addr > it->va_range_max) {
+                *virt_addr_ret = it->va_range_min;
+                *phys_addr_ret = UINT64_MAX;
+                it->it.virt_addr = it->va_range_max + 1;
+                return NULL;
         }
-        if (level == it->levels - 1 &&
-            it->index[level] >= it->last_level_num_entries)
-                level++;
-        // For every level below that, refill the cache/return pages.
-        for (;; level--) {
-                uint16_t num_entries;
-                uint64_t table;
-                bool table_physical;
-                if (level == it->levels) {
-                        num_entries = it->last_level_num_entries;
-                        if (virt_addr < it->va_range_min ||
-                            virt_addr > it->va_range_max) {
-                                *virt_addr_ret = it->va_range_min;
-                                *phys_addr_ret = UINT64_MAX;
-                                it->it.virt_addr = it->va_range_max + 1;
-                                return NULL;
-                        }
-                        table = it->it.pgtable;
-                        table_physical = false;
-                } else {
-                        num_entries = it->entries_per_level;
-                        uint64_t entry = it->table[level * num_entries + it->index[level]++];
+
+        uint16_t num_entries = it->last_level_num_entries;
+        uint64_t table = it->it.pgtable;
+        bool table_physical = false;
+        for (int level = it->levels;; level--) {
+                uint8_t level_shift = page_shift + pgtable_shift * (level - 1);
+                uint16_t index = (virt_addr >> level_shift) & (num_entries - 1);
+                uint16_t cached_index = (it->cached_virt_addr >> level_shift) &
+                                        (num_entries - 1);
+                if (index != cached_index)
+                        memset(it->table, 0, 8 * level);
+                uint64_t *entry_ptr = &it->table[level - 1];
+                if (!*entry_ptr) {
+                        err = drgn_program_read_memory(prog, entry_ptr,
+                                                       table + 8 * index, 8,
+                                                       table_physical);
+                        if (err)
+                                return err;
                         if (bswap)
-                                entry = bswap_64(entry);
-                        table = ((entry & it->pa_low_mask) |
-                                 (entry & it->pa_high_mask) << 36);
-                        // Descriptor bits [1:0] identify the descriptor type:
-                        //
-                        // 0x0, 0x2: invalid
-                        // 0x1: lowest level: reserved, invalid
-                        //      higher levels: block
-                        // 0x3: lowest level: page
-                        //      higher levels: table
-                        if ((entry & 0x3) != 0x3 || level == 0) {
-                                uint64_t mask = (UINT64_C(1) <<
-                                                 (page_shift +
-                                                  pgtable_shift * level)) - 1;
-                                *virt_addr_ret = virt_addr & ~mask;
-                                if ((entry & 0x3) == (level == 0 ? 0x3 : 0x1))
-                                        *phys_addr_ret = table & ~mask;
-                                else
-                                        *phys_addr_ret = UINT64_MAX;
-                                it->it.virt_addr = (virt_addr | mask) + 1;
-                                return NULL;
-                        }
-                        table_physical = true;
+                                *entry_ptr = bswap_64(*entry_ptr);
                 }
-                uint16_t index = ((virt_addr >>
-                                   (page_shift + pgtable_shift * (level - 1)))
-                                  & (num_entries - 1));
-                err = drgn_program_read_memory(prog,
-                                               &it->table[(level - 1) * it->entries_per_level + index],
-                                               table + 8 * index,
-                                               8 * (num_entries - index),
-                                               table_physical);
-                if (err)
-                        return err;
-                it->index[level - 1] = index;
+                uint64_t entry = *entry_ptr;
+
+                num_entries = it->entries_per_level;
+                table = ((entry & it->pa_low_mask) |
+                         (entry & it->pa_high_mask) << 36);
+
+                // Descriptor bits [1:0] identify the descriptor type:
+                //
+                // 0x0, 0x2: invalid
+                // 0x1: lowest level: reserved, invalid
+                //      higher levels: block
+                // 0x3: lowest level: page
+                //      higher levels: table
+                if ((entry & 0x3) != 0x3 || level == 1) {
+                        uint64_t mask = (UINT64_C(1) << level_shift) - 1;
+                        *virt_addr_ret = virt_addr & ~mask;
+                        if ((entry & 0x3) == (level == 1 ? 0x3 : 0x1))
+                                *phys_addr_ret = table & ~mask;
+                        else
+                                *phys_addr_ret = UINT64_MAX;
+                        it->cached_virt_addr = virt_addr;
+                        it->it.virt_addr = (virt_addr | mask) + 1;
+                        return NULL;
+                }
+                table_physical = true;
         }
 }
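
A note on the caching scheme in the new walker: it->table[level - 1]
holds the single descriptor read at each level for the most recently
translated address (cached_virt_addr), and a zeroed slot means "not
cached", which is safe because a zero descriptor is invalid per the
descriptor-type table above. The standalone sketch below (hypothetical
names, and simplified to a uniform index width per level, unlike the
real top level) shows the invalidation rule: once the new address's
table index diverges from the cached walk at some level, that level
and everything below it is stale.

    #include <stdint.h>
    #include <string.h>

    struct walk_cache {
        uint64_t cached_virt_addr;
        uint64_t table[5]; /* table[level - 1] = descriptor cached at level */
    };

    void invalidate_stale_levels(struct walk_cache *c, uint64_t virt_addr,
                                 unsigned int page_shift,
                                 unsigned int pgtable_shift, int levels)
    {
        for (int level = levels; level >= 1; level--) {
            unsigned int shift = page_shift + pgtable_shift * (level - 1);
            uint64_t index_mask = (UINT64_C(1) << pgtable_shift) - 1;
            if (((virt_addr >> shift) & index_mask) !=
                ((c->cached_virt_addr >> shift) & index_mask)) {
                /* Indexes diverge here: slots 0 .. level - 1 (this level
                 * and everything below it) no longer describe virt_addr. */
                memset(c->table, 0, sizeof(c->table[0]) * level);
                break;
            }
        }
        /* The real walker updates cached_virt_addr only once a leaf
         * descriptor is returned. */
        c->cached_virt_addr = virt_addr;
    }

The payoff: when iterating sequentially over a mapped range, only the
lowest level's index changes between consecutive pages, so each step
needs just one fresh 8-byte read rather than a walk from the root.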