Skip to content

Commit 8e3d097

Browse files
authored
Merge pull request #2788 from mseri/create-gm-on-start-refactored
Create VM guest metrics on VM start
2 parents cad6ac6 + 9ca1f62 commit 8e3d097

File tree

3 files changed

+153
-99
lines changed

3 files changed

+153
-99
lines changed

ocaml/xapi/xapi_guest_agent.ml

Lines changed: 54 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -128,10 +128,10 @@ let memory_targets : (int, int64) Hashtbl.t = Hashtbl.create 20
128128
let dead_domains : IntSet.t ref = ref IntSet.empty
129129
let mutex = Mutex.create ()
130130

131-
(** Reset all the guest metrics for a particular VM. 'lookup' reads a key from xenstore
132-
and 'list' reads a directory from xenstore. Both are relative to the guest's
133-
domainpath. *)
134-
let all (lookup: string -> string option) (list: string -> string list) ~__context ~domid ~uuid =
131+
132+
(* In the following functions, 'lookup' reads a key from xenstore and 'list' reads
133+
a directory from xenstore. Both are relative to the guest's domainpath. *)
134+
let get_initial_guest_metrics (lookup: string -> string option) (list: string -> string list) =
135135
let all_control = list "control" in
136136
let to_map kvpairs = List.concat (List.map (fun (xskey, mapkey) -> match lookup xskey with
137137
| Some xsval -> [ mapkey, xsval ]
@@ -156,15 +156,58 @@ let all (lookup: string -> string option) (list: string -> string list) ~__conte
156156
and last_updated = Unix.gettimeofday () in
157157
let can_use_hotplug_vbd = get_tristate "feature/hotplug/vbd" in
158158
let can_use_hotplug_vif = get_tristate "feature/hotplug/vif" in
159-
160-
(* let num = Mutex.execute mutex (fun () -> Hashtbl.fold (fun _ _ c -> 1 + c) cache 0) in
161-
debug "Number of entries in hashtbl: %d" num; *)
162-
163159
(* to avoid breakage whilst 'micro' is added to linux and windows agents, default this field
164160
to -1 if it's not present in xenstore *)
165161
let pv_drivers_version =
166162
if List.mem_assoc "micro" pv_drivers_version then pv_drivers_version (* already there; do nothing *)
167-
else ("micro","-1")::pv_drivers_version in
163+
else ("micro","-1")::pv_drivers_version
164+
in
165+
{pv_drivers_version; os_version; networks; other; memory; device_id; last_updated; can_use_hotplug_vbd; can_use_hotplug_vif;}
166+
167+
168+
let create_and_set_guest_metrics (lookup: string -> string option) (list: string -> string list) ~__context ~domid ~uuid =
169+
let initial_gm = get_initial_guest_metrics lookup list
170+
in
171+
let self = Db.VM.get_by_uuid ~__context ~uuid in
172+
let new_gm_uuid = (Uuid.to_string (Uuid.make_uuid ()))
173+
and new_gm_ref = Ref.make () in
174+
Db.VM_guest_metrics.create ~__context
175+
~ref:new_gm_ref
176+
~uuid:new_gm_uuid
177+
~os_version:initial_gm.os_version
178+
~pV_drivers_version:initial_gm.pv_drivers_version
179+
~pV_drivers_up_to_date:false
180+
~memory:[] ~disks:[]
181+
~networks:initial_gm.networks
182+
~pV_drivers_detected:false
183+
~other:initial_gm.other
184+
~last_updated:(Date.of_float initial_gm.last_updated)
185+
~other_config:[]
186+
~live:true
187+
~can_use_hotplug_vbd:initial_gm.can_use_hotplug_vbd
188+
~can_use_hotplug_vif:initial_gm.can_use_hotplug_vif
189+
;
190+
Db.VM.set_guest_metrics ~__context ~self ~value:new_gm_ref;
191+
192+
(* Update the cache with the new values *)
193+
Mutex.execute mutex (fun () -> Hashtbl.replace cache domid initial_gm);
194+
(* We've just set the thing to live, let's make sure it's not in the dead list *)
195+
Mutex.execute mutex (fun () -> dead_domains := IntSet.remove domid !dead_domains);
196+
197+
let sl xs = String.concat "; " (List.map (fun (k, v) -> k ^ ": " ^ v) xs) in
198+
info "Received initial update from guest agent in VM %s; os_version = [ %s ]; pv_drivers_version = [ %s ]; networks = [ %s ]"
199+
(Ref.string_of self) (sl initial_gm.os_version) (sl initial_gm.pv_drivers_version) (sl initial_gm.networks);
200+
new_gm_ref
201+
202+
203+
(** Reset all the guest metrics for a particular VM *)
204+
let all (lookup: string -> string option) (list: string -> string list) ~__context ~domid ~uuid =
205+
let {pv_drivers_version; os_version; networks; other; memory; device_id;
206+
last_updated; can_use_hotplug_vbd; can_use_hotplug_vif;} = get_initial_guest_metrics lookup list
207+
in
208+
209+
(* let num = Mutex.execute mutex (fun () -> Hashtbl.fold (fun _ _ c -> 1 + c) cache 0) in
210+
debug "Number of entries in hashtbl: %d" num; *)
168211

169212
let self = Db.VM.get_by_uuid ~__context ~uuid in
170213

@@ -228,16 +271,8 @@ let all (lookup: string -> string option) (list: string -> string list) ~__conte
228271
then existing
229272
else
230273
(* if it doesn't exist, make a fresh one *)
231-
let new_ref = Ref.make () and new_uuid = Uuid.to_string (Uuid.make_uuid ()) in
232-
Db.VM_guest_metrics.create ~__context ~ref:new_ref ~uuid:new_uuid
233-
~os_version:os_version ~pV_drivers_version:pv_drivers_version ~pV_drivers_up_to_date:false ~memory:[] ~disks:[] ~networks:networks ~other:other
234-
~pV_drivers_detected:false ~last_updated:(Date.of_float last_updated) ~other_config:[] ~live:true ~can_use_hotplug_vbd:`unspecified ~can_use_hotplug_vif:`unspecified;
235-
Db.VM.set_guest_metrics ~__context ~self ~value:new_ref;
236-
(* We've just set the thing to live, let's make sure it's not in the dead list *)
237-
let sl xs = String.concat "; " (List.map (fun (k, v) -> k ^ ": " ^ v) xs) in
238-
info "Received initial update from guest agent in VM %s; os_version = [ %s]; pv_drivers_version = [ %s ]; networks = [ %s ]" (Ref.string_of self) (sl os_version) (sl pv_drivers_version) (sl networks);
239-
Mutex.execute mutex (fun () -> dead_domains := IntSet.remove domid !dead_domains);
240-
new_ref in
274+
create_and_set_guest_metrics lookup list ~__context ~domid ~uuid
275+
in
241276

242277
(* We unconditionally reset the database values but observe that the database
243278
checks whether a value has actually changed before doing anything *)

ocaml/xapi/xapi_vm.ml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -241,9 +241,7 @@ let start ~__context ~vm ~start_paused ~force =
241241
(* Clear out any VM guest metrics record. Guest metrics will be updated by
242242
* the running VM and for now they might be wrong, especially network
243243
* addresses inherited by a cloned VM. *)
244-
let vm_gm = Db.VM.get_guest_metrics ~__context ~self:vm in
245-
Db.VM.set_guest_metrics ~__context ~self:vm ~value:Ref.null;
246-
(try Db.VM_guest_metrics.destroy ~__context ~self:vm_gm with _ -> ());
244+
Xapi_vm_helpers.delete_guest_metrics ~__context ~self:vm;
247245

248246
(* This makes sense here while the available versions are 0, 1 and 2.
249247
* If/when we introduce another version, we must reassess this. *)
@@ -253,7 +251,7 @@ let start ~__context ~vm ~start_paused ~force =
253251
Cpuid_helpers.reset_cpu_flags ~__context ~vm;
254252

255253
(* If the VM has any vGPUs, gpumon must remain stopped until the
256-
* VM has started. *)
254+
* VM has started. *)
257255
begin
258256
match vmr.API.vM_VGPUs with
259257
| [] -> Xapi_xenops.start ~__context ~self:vm start_paused force

ocaml/xapi/xapi_xenops.ml

Lines changed: 97 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1276,22 +1276,80 @@ let update_vm ~__context id =
12761276
let a = Opt.map (fun x -> f (snd x)) info in
12771277
let b = Opt.map f previous in
12781278
a <> b in
1279+
(* Helpers to create and update guest metrics when needed *)
1280+
let lookup state key =
1281+
if List.mem_assoc key state.guest_agent then Some (List.assoc key state.guest_agent) else None in
1282+
let list state dir =
1283+
let dir = if dir.[0] = '/' then String.sub dir 1 (String.length dir - 1) else dir in
1284+
let results = Listext.List.filter_map (fun (path, value) ->
1285+
if String.startswith dir path then begin
1286+
let rest = String.sub path (String.length dir) (String.length path - (String.length dir)) in
1287+
match List.filter (fun x -> x <> "") (String.split '/' rest) with
1288+
| x :: _ -> Some x
1289+
| _ -> None
1290+
end else None
1291+
) state.guest_agent |> Listext.List.setify in
1292+
results in
1293+
let create_guest_metrics_if_needed () =
1294+
let gm = Db.VM.get_guest_metrics ~__context ~self in
1295+
if gm = Ref.null then
1296+
Opt.iter
1297+
(fun (_, state) ->
1298+
List.iter
1299+
(fun domid ->
1300+
try
1301+
let new_gm_ref = Xapi_guest_agent.create_and_set_guest_metrics (lookup state) (list state) ~__context ~domid ~uuid:id in
1302+
debug "xenopsd event: created guest metrics %s for VM %s" (Ref.string_of new_gm_ref) id
1303+
with e ->
1304+
error "Caught %s: while creating VM %s guest metrics" (Printexc.to_string e) id
1305+
) state.domids
1306+
) info in
1307+
let check_guest_agent () =
1308+
Opt.iter
1309+
(fun (_, state) ->
1310+
Opt.iter (fun oldstate ->
1311+
let old_ga = oldstate.guest_agent in
1312+
let new_ga = state.guest_agent in
1313+
1314+
(* Remove memory keys *)
1315+
let ignored_keys = [ "data/meminfo_free"; "data/updated"; "data/update_cnt" ] in
1316+
let remove_ignored ga =
1317+
List.fold_left (fun acc k -> List.filter (fun x -> fst x <> k) acc) ga ignored_keys in
1318+
let old_ga = remove_ignored old_ga in
1319+
let new_ga = remove_ignored new_ga in
1320+
if new_ga <> old_ga then begin
1321+
debug "Will update VM.allowed_operations because guest_agent has changed.";
1322+
should_update_allowed_operations := true
1323+
end else begin
1324+
debug "Supressing VM.allowed_operations update because guest_agent data is largely the same"
1325+
end
1326+
) previous;
1327+
List.iter
1328+
(fun domid ->
1329+
try
1330+
debug "xenopsd event: Updating VM %s domid %d guest_agent" id domid;
1331+
Xapi_guest_agent.all (lookup state) (list state) ~__context ~domid ~uuid:id
1332+
with e ->
1333+
error "Caught %s: while updating VM %s guest_agent" (Printexc.to_string e) id
1334+
) state.domids
1335+
) info in
12791336
(* Notes on error handling: if something fails we log and continue, to
1280-
maximise the amount of state which is correctly synced. If something
1281-
does fail then we may end up permanently out-of-sync until either a
1282-
process restart or an event is generated. We may wish to periodically
1283-
inject artificial events IF there has been an event sync failure? *)
1337+
maximise the amount of state which is correctly synced. If something
1338+
does fail then we may end up permanently out-of-sync until either a
1339+
process restart or an event is generated. We may wish to periodically
1340+
inject artificial events IF there has been an event sync failure? *)
12841341
if different (fun x -> x.power_state) then begin
12851342
try
12861343
debug "Will update VM.allowed_operations because power_state has changed.";
12871344
should_update_allowed_operations := true;
12881345
let power_state = xenapi_of_xenops_power_state (Opt.map (fun x -> (snd x).power_state) info) in
12891346
debug "xenopsd event: Updating VM %s power_state <- %s" id (Record_util.power_state_to_string power_state);
12901347
(* This will mark VBDs, VIFs as detached and clear resident_on
1291-
if the VM has permanently shutdown. current-operations
1292-
should not be reset as there maybe a checkpoint is ongoing*)
1348+
if the VM has permanently shutdown. current-operations
1349+
should not be reset as there maybe a checkpoint is ongoing*)
12931350
Xapi_vm_lifecycle.force_state_reset_keep_current_operations ~__context ~self ~value:power_state;
12941351

1352+
if power_state = `Running then create_guest_metrics_if_needed ();
12951353
if power_state = `Suspended || power_state = `Halted then begin
12961354
Xapi_network.detach_for_vm ~__context ~host:localhost ~vm:self;
12971355
Storage_access.reset ~__context ~vm:self;
@@ -1406,48 +1464,6 @@ let update_vm ~__context id =
14061464
with e ->
14071465
error "Caught %s: while updating VM %s rtc/timeoffset" (Printexc.to_string e) id
14081466
end;
1409-
let check_guest_agent () =
1410-
Opt.iter
1411-
(fun (_, state) ->
1412-
Opt.iter (fun oldstate ->
1413-
let old_ga = oldstate.guest_agent in
1414-
let new_ga = state.guest_agent in
1415-
1416-
(* Remove memory keys *)
1417-
let ignored_keys = [ "data/meminfo_free"; "data/updated"; "data/update_cnt" ] in
1418-
let remove_ignored ga =
1419-
List.fold_left (fun acc k -> List.filter (fun x -> fst x <> k) acc) ga ignored_keys in
1420-
let old_ga = remove_ignored old_ga in
1421-
let new_ga = remove_ignored new_ga in
1422-
if new_ga <> old_ga then begin
1423-
debug "Will update VM.allowed_operations because guest_agent has changed.";
1424-
should_update_allowed_operations := true
1425-
end else begin
1426-
debug "Supressing VM.allowed_operations update because guest_agent data is largely the same"
1427-
end
1428-
) previous;
1429-
List.iter
1430-
(fun domid ->
1431-
let lookup key =
1432-
if List.mem_assoc key state.guest_agent then Some (List.assoc key state.guest_agent) else None in
1433-
let list dir =
1434-
let dir = if dir.[0] = '/' then String.sub dir 1 (String.length dir - 1) else dir in
1435-
let results = Listext.List.filter_map (fun (path, value) ->
1436-
if String.startswith dir path then begin
1437-
let rest = String.sub path (String.length dir) (String.length path - (String.length dir)) in
1438-
match List.filter (fun x -> x <> "") (String.split '/' rest) with
1439-
| x :: _ -> Some x
1440-
| _ -> None
1441-
end else None
1442-
) state.guest_agent |> Listext.List.setify in
1443-
results in
1444-
try
1445-
debug "xenopsd event: Updating VM %s domid %d guest_agent" id domid;
1446-
Xapi_guest_agent.all lookup list ~__context ~domid ~uuid:id
1447-
with e ->
1448-
error "Caught %s: while updating VM %s guest_agent" (Printexc.to_string e) id
1449-
) state.domids
1450-
) info in
14511467
if different (fun x -> x.hvm) then begin
14521468
Opt.iter
14531469
(fun (_, state) ->
@@ -1484,15 +1500,44 @@ let update_vm ~__context id =
14841500
let update_pv_drivers_detected () =
14851501
Opt.iter
14861502
(fun (_, state) ->
1487-
let gm = Db.VM.get_guest_metrics ~__context ~self in
1488-
debug "xenopsd event: Updating VM %s PV drivers detected %b" id state.pv_drivers_detected;
1489-
Db.VM_guest_metrics.set_PV_drivers_detected ~__context ~self:gm ~value:state.pv_drivers_detected;
1490-
Db.VM_guest_metrics.set_PV_drivers_up_to_date ~__context ~self:gm ~value:state.pv_drivers_detected
1503+
try
1504+
let gm = Db.VM.get_guest_metrics ~__context ~self in
1505+
debug "xenopsd event: Updating VM %s PV drivers detected %b" id state.pv_drivers_detected;
1506+
Db.VM_guest_metrics.set_PV_drivers_detected ~__context ~self:gm ~value:state.pv_drivers_detected;
1507+
Db.VM_guest_metrics.set_PV_drivers_up_to_date ~__context ~self:gm ~value:state.pv_drivers_detected
1508+
with e ->
1509+
debug "Caught %s: while updating VM %s PV drivers" (Printexc.to_string e) id
14911510
) info in
1511+
(* Chack last_start_time before updating anything in the guest metrics *)
1512+
if different (fun x -> x.last_start_time) then begin
1513+
try
1514+
Opt.iter
1515+
(fun (_, state) ->
1516+
debug "xenopsd event: Updating VM %s last_start_time <- %s" id (Date.to_string (Date.of_float state.last_start_time));
1517+
let metrics = Db.VM.get_metrics ~__context ~self in
1518+
let start_time = Date.of_float state.last_start_time in
1519+
Db.VM_metrics.set_start_time ~__context ~self:metrics ~value:start_time;
1520+
1521+
create_guest_metrics_if_needed ();
1522+
let gm = Db.VM.get_guest_metrics ~__context ~self in
1523+
let update_time = Db.VM_guest_metrics.get_last_updated ~__context ~self:gm in
1524+
if update_time < start_time then begin
1525+
debug "VM %s guest metrics update time (%s) < VM start time (%s): deleting"
1526+
id (Date.to_string update_time) (Date.to_string start_time);
1527+
Xapi_vm_helpers.delete_guest_metrics ~__context ~self;
1528+
check_guest_agent ();
1529+
end
1530+
) info
1531+
with e ->
1532+
error "Caught %s: while updating VM %s last_start_time" (Printexc.to_string e) id
1533+
end;
14921534
Opt.iter
14931535
(fun (_, state) ->
14941536
List.iter
14951537
(fun domid ->
1538+
(* Guest metrics could have been destroyed during the last_start_time check
1539+
by recreating them, we avoid CA-223387 *)
1540+
create_guest_metrics_if_needed ();
14961541
if different (fun x -> x.uncooperative_balloon_driver) then begin
14971542
debug "xenopsd event: VM %s domid %d uncooperative_balloon_driver = %b" id domid state.uncooperative_balloon_driver;
14981543
end;
@@ -1526,30 +1571,6 @@ let update_vm ~__context id =
15261571
error "Caught %s: while updating VM %s VCPUs_number" (Printexc.to_string e) id
15271572
) info
15281573
end;
1529-
if different (fun x -> x.last_start_time) then begin
1530-
try
1531-
Opt.iter
1532-
(fun (_, state) ->
1533-
debug "xenopsd event: Updating VM %s last_start_time <- %s" id (Date.to_string (Date.of_float state.last_start_time));
1534-
let metrics = Db.VM.get_metrics ~__context ~self in
1535-
let start_time = Date.of_float state.last_start_time in
1536-
Db.VM_metrics.set_start_time ~__context ~self:metrics ~value:start_time;
1537-
begin
1538-
try
1539-
let gm = Db.VM.get_guest_metrics ~__context ~self in
1540-
let update_time = Db.VM_guest_metrics.get_last_updated ~__context ~self:gm in
1541-
if update_time < start_time then begin
1542-
debug "VM %s guest metrics update time (%s) < VM start time (%s): deleting"
1543-
id (Date.to_string update_time) (Date.to_string start_time);
1544-
Xapi_vm_helpers.delete_guest_metrics ~__context ~self;
1545-
check_guest_agent ();
1546-
end
1547-
with _ -> () (* The guest metrics didn't exist *)
1548-
end
1549-
) info
1550-
with e ->
1551-
error "Caught %s: while updating VM %s last_start_time" (Printexc.to_string e) id
1552-
end;
15531574
if different (fun x -> x.shadow_multiplier_target) then begin
15541575
try
15551576
Opt.iter

0 commit comments

Comments
 (0)