Browse Source

migration/postcopy: Report fault latencies in blocktime

Blocktime so far only cares about the time one vcpu (or the whole system)
got blocked.  It would also be helpful if it could report the latency of
page requests, which can be very sensitive during postcopy.

Blocktime itself is sometimes not very important, especially when one
thinks about KVM async PF support, which means vCPUs are literally almost
not blocked at all because the guest OS is smart enough to switch to
another task when a remote fault is needed.

However, latency is still sensitive and important because even if the guest
vCPU is running on threads that do not need a remote fault, the workload
that accesses some missing page is still affected.

Add two entries to the report, showing how long it takes to resolve a
remote fault.  Mention in the QAPI doc that this is not the real average
fault latency, but only that of the faults that required a remote page
request.

Unwrap get_vcpu_blocktime_list() so we don't need to walk the list twice,
meanwhile add the entry checks in qtests for all postcopy tests.

Cc: Markus Armbruster <armbru@redhat.com>
Cc: Dr. David Alan Gilbert <dave@treblig.org>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Tested-by: Mario Casquero <mcasquer@redhat.com>
Link: https://lore.kernel.org/r/20250613141217.474825-9-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Fabiano Rosas <farosas@suse.de>
pull/294/head
Peter Xu 10 months ago
committed by Fabiano Rosas
parent
commit
b4c82b4288
  1. 67
      migration/migration-hmp-cmds.c
  2. 49
      migration/postcopy-ram.c
  3. 20
      qapi/migration.json
  4. 3
      tests/qtest/migration/migration-qmp.c

67
migration/migration-hmp-cmds.c

@ -52,6 +52,51 @@ static void migration_global_dump(Monitor *mon)
ms->clear_bitmap_shift); ms->clear_bitmap_shift);
} }
/*
 * Print the postcopy blocktime and latency fields of @info to @mon.
 *
 * Each of the four fields is optional and is printed only when its
 * matching has_* flag is set.  The two per-vCPU lists are printed inside
 * square brackets, ten values per output line.
 */
static void migration_dump_blocktime(Monitor *mon, MigrationInfo *info)
{
    if (info->has_postcopy_blocktime) {
        monitor_printf(mon, "Postcopy Blocktime (ms): %" PRIu32 "\n",
                       info->postcopy_blocktime);
    }

    if (info->has_postcopy_vcpu_blocktime) {
        uint32List *node;
        const char *prefix = "";
        int printed = 0;

        monitor_printf(mon, "Postcopy vCPU Blocktime (ms):\n [");
        for (node = info->postcopy_vcpu_blocktime; node; node = node->next) {
            monitor_printf(mon, "%s%" PRIu32, prefix, node->value);
            printed++;
            /* Break the line after every 10th value, unless it was the last */
            if (printed % 10 == 0 && node->next) {
                prefix = ",\n ";
            } else {
                prefix = ", ";
            }
        }
        monitor_printf(mon, "]\n");
    }

    if (info->has_postcopy_latency) {
        monitor_printf(mon, "Postcopy Latency (ns): %" PRIu64 "\n",
                       info->postcopy_latency);
    }

    if (info->has_postcopy_vcpu_latency) {
        uint64List *node;
        const char *prefix = "";
        int printed = 0;

        monitor_printf(mon, "Postcopy vCPU Latencies (ns):\n [");
        for (node = info->postcopy_vcpu_latency; node; node = node->next) {
            monitor_printf(mon, "%s%" PRIu64, prefix, node->value);
            printed++;
            /* Break the line after every 10th value, unless it was the last */
            if (printed % 10 == 0 && node->next) {
                prefix = ",\n ";
            } else {
                prefix = ", ";
            }
        }
        monitor_printf(mon, "]\n");
    }
}
void hmp_info_migrate(Monitor *mon, const QDict *qdict) void hmp_info_migrate(Monitor *mon, const QDict *qdict)
{ {
bool show_all = qdict_get_try_bool(qdict, "all", false); bool show_all = qdict_get_try_bool(qdict, "all", false);
@ -202,27 +247,7 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
info->dirty_limit_ring_full_time); info->dirty_limit_ring_full_time);
} }
if (info->has_postcopy_blocktime) { migration_dump_blocktime(mon, info);
monitor_printf(mon, "Postcopy Blocktime (ms): %" PRIu32 "\n",
info->postcopy_blocktime);
}
if (info->has_postcopy_vcpu_blocktime) {
uint32List *item = info->postcopy_vcpu_blocktime;
const char *sep = "";
int count = 0;
monitor_printf(mon, "Postcopy vCPU Blocktime (ms):\n [");
while (item) {
monitor_printf(mon, "%s%"PRIu32, sep, item->value);
item = item->next;
/* Each line 10 vcpu results, newline if there's more */
sep = ((++count % 10 == 0) && item) ? ",\n " : ", ";
}
monitor_printf(mon, "]\n");
}
out: out:
qapi_free_MigrationInfo(info); qapi_free_MigrationInfo(info);
} }

49
migration/postcopy-ram.c

@ -166,21 +166,6 @@ static struct PostcopyBlocktimeContext *blocktime_context_new(void)
return ctx; return ctx;
} }
/*
 * Build a uint32 list holding one blocktime value per vCPU, taken from
 * ctx->vcpu_blocktime_total[] and converted from nanoseconds to
 * milliseconds.
 *
 * The vCPUs are walked from the highest index down to 0 and each value is
 * prepended, so the first element of the returned list belongs to vCPU 0.
 */
static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
{
    MachineState *machine = MACHINE(qdev_get_machine());
    uint32List *result = NULL;
    int cpu = machine->smp.cpus;

    while (--cpu >= 0) {
        /* Convert ns -> ms */
        uint32_t blocktime_ms =
            (uint32_t)(ctx->vcpu_blocktime_total[cpu] / SCALE_MS);

        QAPI_LIST_PREPEND(result, blocktime_ms);
    }

    return result;
}
/* /*
* This function just populates MigrationInfo from postcopy's * This function just populates MigrationInfo from postcopy's
* blocktime context. It will not populate MigrationInfo, * blocktime context. It will not populate MigrationInfo,
@ -192,16 +177,48 @@ void fill_destination_postcopy_migration_info(MigrationInfo *info)
{ {
MigrationIncomingState *mis = migration_incoming_get_current(); MigrationIncomingState *mis = migration_incoming_get_current();
PostcopyBlocktimeContext *bc = mis->blocktime_ctx; PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
MachineState *ms = MACHINE(qdev_get_machine());
uint64_t latency_total = 0, faults = 0;
uint32List *list_blocktime = NULL;
uint64List *list_latency = NULL;
int i;
if (!bc) { if (!bc) {
return; return;
} }
for (i = ms->smp.cpus - 1; i >= 0; i--) {
uint64_t latency, total, count;
/* Convert ns -> ms */
QAPI_LIST_PREPEND(list_blocktime,
(uint32_t)(bc->vcpu_blocktime_total[i] / SCALE_MS));
/* The rest in nanoseconds */
total = bc->vcpu_blocktime_total[i];
latency_total += total;
count = bc->vcpu_faults_count[i];
faults += count;
if (count) {
latency = total / count;
} else {
/* No fault detected */
latency = 0;
}
QAPI_LIST_PREPEND(list_latency, latency);
}
info->has_postcopy_blocktime = true; info->has_postcopy_blocktime = true;
/* Convert ns -> ms */ /* Convert ns -> ms */
info->postcopy_blocktime = (uint32_t)(bc->total_blocktime / SCALE_MS); info->postcopy_blocktime = (uint32_t)(bc->total_blocktime / SCALE_MS);
info->has_postcopy_vcpu_blocktime = true; info->has_postcopy_vcpu_blocktime = true;
info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc); info->postcopy_vcpu_blocktime = list_blocktime;
info->has_postcopy_latency = true;
info->postcopy_latency = faults ? (latency_total / faults) : 0;
info->has_postcopy_vcpu_latency = true;
info->postcopy_vcpu_latency = list_latency;
} }
static uint64_t get_postcopy_total_blocktime(void) static uint64_t get_postcopy_total_blocktime(void)

20
qapi/migration.json

@ -236,6 +236,17 @@
# This is only present when the postcopy-blocktime migration # This is only present when the postcopy-blocktime migration
# capability is enabled. (Since 3.0) # capability is enabled. (Since 3.0)
# #
# @postcopy-latency: average remote page fault latency (in ns). Note that
# this doesn't include all faults, but only the ones that require a
# remote page request. So it should always be greater than the real
# average page fault latency. This is only present when the
# postcopy-blocktime migration capability is enabled. (Since 10.1)
#
# @postcopy-vcpu-latency: average remote page fault latency per vCPU (in
# ns). It has the same definition as @postcopy-latency, but is
# reported separately for each vCPU. This is only present when the
# postcopy-blocktime migration capability is enabled. (Since 10.1)
#
# @socket-address: Only used for tcp, to know what the real port is # @socket-address: Only used for tcp, to know what the real port is
# (Since 4.0) # (Since 4.0)
# #
@ -260,6 +271,11 @@
# average memory load of the virtual CPU indirectly. Note that # average memory load of the virtual CPU indirectly. Note that
# zero means guest doesn't dirty memory. (Since 8.1) # zero means guest doesn't dirty memory. (Since 8.1)
# #
# Features:
#
# @unstable: Members @postcopy-latency, @postcopy-vcpu-latency are
# experimental.
#
# Since: 0.14 # Since: 0.14
## ##
{ 'struct': 'MigrationInfo', { 'struct': 'MigrationInfo',
@ -275,6 +291,10 @@
'*blocked-reasons': ['str'], '*blocked-reasons': ['str'],
'*postcopy-blocktime': 'uint32', '*postcopy-blocktime': 'uint32',
'*postcopy-vcpu-blocktime': ['uint32'], '*postcopy-vcpu-blocktime': ['uint32'],
'*postcopy-latency': {
'type': 'uint64', 'features': [ 'unstable' ] },
'*postcopy-vcpu-latency': {
'type': ['uint64'], 'features': [ 'unstable' ] },
'*socket-address': ['SocketAddress'], '*socket-address': ['SocketAddress'],
'*dirty-limit-throttle-time-per-round': 'uint64', '*dirty-limit-throttle-time-per-round': 'uint64',
'*dirty-limit-ring-full-time': 'uint64'} } '*dirty-limit-ring-full-time': 'uint64'} }

3
tests/qtest/migration/migration-qmp.c

@ -358,6 +358,9 @@ void read_blocktime(QTestState *who)
rsp_return = migrate_query_not_failed(who); rsp_return = migrate_query_not_failed(who);
g_assert(qdict_haskey(rsp_return, "postcopy-blocktime")); g_assert(qdict_haskey(rsp_return, "postcopy-blocktime"));
g_assert(qdict_haskey(rsp_return, "postcopy-vcpu-blocktime"));
g_assert(qdict_haskey(rsp_return, "postcopy-latency"));
g_assert(qdict_haskey(rsp_return, "postcopy-vcpu-latency"));
qobject_unref(rsp_return); qobject_unref(rsp_return);
} }

Loading…
Cancel
Save