VirtualBox

source: vbox/trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp@ 54327

Last change on this file since 54327 was 54327, checked in by vboxsync, 10 years ago

SUPDrv: Split out the GIP related code into SUPDrvGip.cpp.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 129.2 KB
Line 
1/* $Id: SUPDrvGip.cpp 54327 2015-02-20 13:35:30Z vboxsync $ */
2/** @file
3 * VBoxDrv - The VirtualBox Support Driver - Common code for GIP.
4 */
5
6/*
7 * Copyright (C) 2006-2015 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27/*******************************************************************************
28* Header Files *
29*******************************************************************************/
30#define LOG_GROUP LOG_GROUP_SUP_DRV
31#define SUPDRV_AGNOSTIC
32#include "SUPDrvInternal.h"
33#ifndef PAGE_SHIFT
34# include <iprt/param.h>
35#endif
36#include <iprt/asm.h>
37#include <iprt/asm-amd64-x86.h>
38#include <iprt/asm-math.h>
39#include <iprt/cpuset.h>
40#include <iprt/handletable.h>
41#include <iprt/mem.h>
42#include <iprt/mp.h>
43#include <iprt/power.h>
44#include <iprt/process.h>
45#include <iprt/semaphore.h>
46#include <iprt/spinlock.h>
47#include <iprt/thread.h>
48#include <iprt/uuid.h>
49#include <iprt/net.h>
50#include <iprt/crc.h>
51#include <iprt/string.h>
52#include <iprt/timer.h>
53#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)
54# include <iprt/rand.h>
55# include <iprt/path.h>
56#endif
57#include <iprt/uint128.h>
58#include <iprt/x86.h>
59
60#include <VBox/param.h>
61#include <VBox/log.h>
62#include <VBox/err.h>
63
64#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
65# include "dtrace/SUPDrv.h"
66#else
67/* ... */
68#endif
69
70
71/*******************************************************************************
72* Defined Constants And Macros *
73*******************************************************************************/
74/** The frequency by which we recalculate the u32UpdateHz and
75 * u32UpdateIntervalNS GIP members. The value must be a power of 2.
76 *
77 * Warning: Bumping this too high might overflow u32UpdateIntervalNS.
78 */
79#define GIP_UPDATEHZ_RECALC_FREQ 0x800
80
81/** A reserved TSC value used for synchronization as well as measurement of
82 * TSC deltas. */
83#define GIP_TSC_DELTA_RSVD UINT64_MAX
84/** The number of TSC delta measurement loops in total (includes primer and
85 * read-time loops). */
86#define GIP_TSC_DELTA_LOOPS 96
87/** The number of cache primer loops. */
88#define GIP_TSC_DELTA_PRIMER_LOOPS 4
89/** The number of loops until we keep computing the minimum read time. */
90#define GIP_TSC_DELTA_READ_TIME_LOOPS 24
91/** Stop measurement of TSC delta. */
92#define GIP_TSC_DELTA_SYNC_STOP 0
93/** Start measurement of TSC delta. */
94#define GIP_TSC_DELTA_SYNC_START 1
95/** Worker thread is ready for reading the TSC. */
96#define GIP_TSC_DELTA_SYNC_WORKER_READY 2
97/** Worker thread is done updating TSC delta info. */
98#define GIP_TSC_DELTA_SYNC_WORKER_DONE 3
99/** When IPRT isn't concurrent safe: Master is ready and will wait for worker
100 * with a timeout. */
101#define GIP_TSC_DELTA_SYNC_PRESTART_MASTER 4
102/** When IPRT isn't concurrent safe: Worker is ready after waiting for
103 * master with a timeout. */
104#define GIP_TSC_DELTA_SYNC_PRESTART_WORKER 5
105/** The TSC-refinement interval in seconds. */
106#define GIP_TSC_REFINE_INTERVAL 5
107/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_PRACTICALLY_ZERO rating */
108#define GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO 32
109/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_ROUGHLY_ZERO rating */
110#define GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO 448
111/** The TSC delta value for the initial GIP master - 0 in regular builds.
112 * To test the delta code this can be set to a non-zero value. */
113#if 1
114# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(170139095182512) /* 0x00009abd9854acb0 */
115#else
116# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(0)
117#endif
118
119AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS < GIP_TSC_DELTA_READ_TIME_LOOPS);
120AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS < GIP_TSC_DELTA_LOOPS);
121
122/** @def VBOX_SVN_REV
123 * The makefile should define this if it can. */
124#ifndef VBOX_SVN_REV
125# define VBOX_SVN_REV 0
126#endif
127
128#if 0 /* Don't start the GIP timers. Useful when debugging the IPRT timer code. */
129# define DO_NOT_START_GIP
130#endif
131
132/*******************************************************************************
133* Internal Functions *
134*******************************************************************************/
135static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
136static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
137static DECLCALLBACK(void) supdrvGipMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser);
138static void supdrvGipInit(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, RTHCPHYS HCPhys, uint64_t u64NanoTS,
139 unsigned uUpdateHz, unsigned uUpdateIntervalNS, unsigned cCpus);
140static DECLCALLBACK(void) supdrvGipInitOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2);
141static void supdrvGipTerm(PSUPGLOBALINFOPAGE pGip);
142static void supdrvGipUpdate(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC, RTCPUID idCpu, uint64_t iTick);
143static void supdrvGipUpdatePerCpu(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC,
144 RTCPUID idCpu, uint8_t idApic, uint64_t iTick);
145static void supdrvGipInitCpu(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS);
146static int supdrvMeasureInitialTscDeltas(PSUPDRVDEVEXT pDevExt);
147static int supdrvMeasureTscDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker);
148
149
150/*******************************************************************************
151* Global Variables *
152*******************************************************************************/
153DECLEXPORT(PSUPGLOBALINFOPAGE) g_pSUPGlobalInfoPage = NULL;
154
155
156
157/**
158 * (Re-)initializes the per-cpu structure prior to starting or resuming the GIP
159 * updating.
160 *
161 * @param pGip Pointer to the GIP.
162 * @param pGipCpu The per CPU structure for this CPU.
163 * @param u64NanoTS The current time.
164 */
165static void supdrvGipReInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS)
166{
167 /*
168 * Here we don't really care about applying the TSC delta. The re-initialization of this
169 * value is not relevant especially while (re)starting the GIP as the first few ones will
170 * be ignored anyway, see supdrvGipDoUpdateCpu().
171 */
172 pGipCpu->u64TSC = ASMReadTSC() - pGipCpu->u32UpdateIntervalTSC;
173 pGipCpu->u64NanoTS = u64NanoTS;
174}
175
176
177/**
178 * Set the current TSC and NanoTS value for the CPU.
179 *
180 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
181 * @param pvUser1 Pointer to the ring-0 GIP mapping.
182 * @param pvUser2 Pointer to the variable holding the current time.
183 */
184static DECLCALLBACK(void) supdrvGipReInitCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
185{
186 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser1;
187 unsigned iCpu = pGip->aiCpuFromApicId[ASMGetApicId()];
188
189 if (RT_LIKELY(iCpu < pGip->cCpus && pGip->aCPUs[iCpu].idCpu == idCpu))
190 supdrvGipReInitCpu(pGip, &pGip->aCPUs[iCpu], *(uint64_t *)pvUser2);
191
192 NOREF(pvUser2);
193 NOREF(idCpu);
194}
195
196
/**
 * State structure for supdrvGipDetectGetGipCpuCallback.
 *
 * One instance is shared by all CPUs running the callback, so the members are
 * volatile and updated with atomic operations (see the callback).
 */
typedef struct SUPDRVGIPDETECTGETCPU
{
    /** Bitmap of APIC IDs that has been seen (initialized to zero).
     * Used to detect duplicate APIC IDs (paranoia). */
    uint8_t volatile    bmApicId[256 / 8];
    /** Mask of supported GIP CPU getter methods (SUPGIPGETCPU_XXX) (all bits set
     * initially). The callback clears the methods not detected. */
    uint32_t volatile   fSupported;
    /** The first callback detecting any kind of range issues (initialized to
     * NIL_RTCPUID). */
    RTCPUID volatile    idCpuProblem;
} SUPDRVGIPDETECTGETCPU;
/** Pointer to state structure for supdrvGipDetectGetGipCpuCallback. */
typedef SUPDRVGIPDETECTGETCPU *PSUPDRVGIPDETECTGETCPU;
214
215
216/**
217 * Checks for alternative ways of getting the CPU ID.
218 *
219 * This also checks the APIC ID, CPU ID and CPU set index values against the
220 * GIP tables.
221 *
222 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
223 * @param pvUser1 Pointer to the state structure.
224 * @param pvUser2 Pointer to the GIP.
225 */
226static DECLCALLBACK(void) supdrvGipDetectGetGipCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
227{
228 PSUPDRVGIPDETECTGETCPU pState = (PSUPDRVGIPDETECTGETCPU)pvUser1;
229 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser2;
230 uint32_t fSupported = 0;
231 uint16_t idApic;
232 int iCpuSet;
233
234 AssertMsg(idCpu == RTMpCpuId(), ("idCpu=%#x RTMpCpuId()=%#x\n", idCpu, RTMpCpuId())); /* paranoia^3 */
235
236 /*
237 * Check that the CPU ID and CPU set index are interchangable.
238 */
239 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
240 if ((RTCPUID)iCpuSet == idCpu)
241 {
242 AssertCompile(RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS));
243 if ( iCpuSet >= 0
244 && iCpuSet < RTCPUSET_MAX_CPUS
245 && RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS))
246 {
247 /*
248 * Check whether the IDTR.LIMIT contains a CPU number.
249 */
250#ifdef RT_ARCH_X86
251 uint16_t const cbIdt = sizeof(X86DESC64SYSTEM) * 256;
252#else
253 uint16_t const cbIdt = sizeof(X86DESCGATE) * 256;
254#endif
255 RTIDTR Idtr;
256 ASMGetIDTR(&Idtr);
257 if (Idtr.cbIdt >= cbIdt)
258 {
259 uint32_t uTmp = Idtr.cbIdt - cbIdt;
260 uTmp &= RTCPUSET_MAX_CPUS - 1;
261 if (uTmp == idCpu)
262 {
263 RTIDTR Idtr2;
264 ASMGetIDTR(&Idtr2);
265 if (Idtr2.cbIdt == Idtr.cbIdt)
266 fSupported |= SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS;
267 }
268 }
269
270 /*
271 * Check whether RDTSCP is an option.
272 */
273 if (ASMHasCpuId())
274 {
275 if ( ASMIsValidExtRange(ASMCpuId_EAX(UINT32_C(0x80000000)))
276 && (ASMCpuId_EDX(UINT32_C(0x80000001)) & X86_CPUID_EXT_FEATURE_EDX_RDTSCP) )
277 {
278 uint32_t uAux;
279 ASMReadTscWithAux(&uAux);
280 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
281 {
282 ASMNopPause();
283 ASMReadTscWithAux(&uAux);
284 if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
285 fSupported |= SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS;
286 }
287 }
288 }
289 }
290 }
291
292 /*
293 * Check that the APIC ID is unique.
294 */
295 idApic = ASMGetApicId();
296 if (RT_LIKELY( idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)
297 && !ASMAtomicBitTestAndSet(pState->bmApicId, idApic)))
298 fSupported |= SUPGIPGETCPU_APIC_ID;
299 else
300 {
301 AssertCompile(sizeof(pState->bmApicId) * 8 == RT_ELEMENTS(pGip->aiCpuFromApicId));
302 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
303 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - duplicate APIC ID.\n",
304 idCpu, iCpuSet, idApic));
305 }
306
307 /*
308 * Check that the iCpuSet is within the expected range.
309 */
310 if (RT_UNLIKELY( iCpuSet < 0
311 || (unsigned)iCpuSet >= RTCPUSET_MAX_CPUS
312 || (unsigned)iCpuSet >= RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)))
313 {
314 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
315 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU set index is out of range.\n",
316 idCpu, iCpuSet, idApic));
317 }
318 else
319 {
320 RTCPUID idCpu2 = RTMpCpuIdFromSetIndex(iCpuSet);
321 if (RT_UNLIKELY(idCpu2 != idCpu))
322 {
323 ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
324 LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU id/index roundtrip problem: %#x\n",
325 idCpu, iCpuSet, idApic, idCpu2));
326 }
327 }
328
329 /*
330 * Update the supported feature mask before we return.
331 */
332 ASMAtomicAndU32(&pState->fSupported, fSupported);
333
334 NOREF(pvUser2);
335}
336
337
338/**
339 * Increase the timer freqency on hosts where this is possible (NT).
340 *
341 * The idea is that more interrupts is better for us... Also, it's better than
342 * we increase the timer frequence, because we might end up getting inaccurate
343 * callbacks if someone else does it.
344 *
345 * @param pDevExt Sets u32SystemTimerGranularityGrant if increased.
346 */
347static void supdrvGipRequestHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
348{
349 if (pDevExt->u32SystemTimerGranularityGrant == 0)
350 {
351 uint32_t u32SystemResolution;
352 if ( RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 976563 /* 1024 HZ */, &u32SystemResolution))
353 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1000000 /* 1000 HZ */, &u32SystemResolution))
354 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1953125 /* 512 HZ */, &u32SystemResolution))
355 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 2000000 /* 500 HZ */, &u32SystemResolution))
356 )
357 {
358 Assert(RTTimerGetSystemGranularity() <= u32SystemResolution);
359 pDevExt->u32SystemTimerGranularityGrant = u32SystemResolution;
360 }
361 }
362}
363
364
365/**
366 * Undoes supdrvGipRequestHigherTimerFrequencyFromSystem.
367 *
368 * @param pDevExt Clears u32SystemTimerGranularityGrant.
369 */
370static void supdrvGipReleaseHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
371{
372 if (pDevExt->u32SystemTimerGranularityGrant)
373 {
374 int rc2 = RTTimerReleaseSystemGranularity(pDevExt->u32SystemTimerGranularityGrant);
375 AssertRC(rc2);
376 pDevExt->u32SystemTimerGranularityGrant = 0;
377 }
378}
379
380
/**
 * Maps the GIP into userspace and/or get the physical address of the GIP.
 *
 * @returns IPRT status code.
 * @param   pSession        Session to which the GIP mapping should belong.
 * @param   ppGipR3         Where to store the address of the ring-3 mapping. (optional)
 * @param   pHCPhysGip      Where to store the physical address. (optional)
 *
 * @remark  There is no reference counting on the mapping, so one call to this function
 *          counts globally as one reference. One call to SUPR0GipUnmap() will unmap GIP
 *          and remove the session as a GIP user.
 */
SUPR0DECL(int) SUPR0GipMap(PSUPDRVSESSION pSession, PRTR3PTR ppGipR3, PRTHCPHYS pHCPhysGip)
{
    int             rc;
    PSUPDRVDEVEXT   pDevExt = pSession->pDevExt;
    RTR3PTR         pGipR3  = NIL_RTR3PTR;
    RTHCPHYS        HCPhys  = NIL_RTHCPHYS;
    LogFlow(("SUPR0GipMap: pSession=%p ppGipR3=%p pHCPhysGip=%p\n", pSession, ppGipR3, pHCPhysGip));

    /*
     * Validate
     */
    AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
    AssertPtrNullReturn(ppGipR3, VERR_INVALID_POINTER);
    AssertPtrNullReturn(pHCPhysGip, VERR_INVALID_POINTER);

    /* Serialize against other mappers/unmappers and GIP suspend/resume. */
#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
#else
    RTSemFastMutexRequest(pDevExt->mtxGip);
#endif
    if (pDevExt->pGip)
    {
        /*
         * Map it?  (Read-only ring-3 mapping; created lazily once per session.)
         */
        rc = VINF_SUCCESS;
        if (ppGipR3)
        {
            if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
                rc = RTR0MemObjMapUser(&pSession->GipMapObjR3, pDevExt->GipMemObj, (RTR3PTR)-1, 0,
                                       RTMEM_PROT_READ, RTR0ProcHandleSelf());
            if (RT_SUCCESS(rc))
                pGipR3 = RTR0MemObjAddressR3(pSession->GipMapObjR3);
        }

        /*
         * Get physical address.
         */
        if (pHCPhysGip && RT_SUCCESS(rc))
            HCPhys = pDevExt->HCPhysGip;

        /*
         * Reference globally.  The first user (re)starts GIP updating.
         */
        if (!pSession->fGipReferenced && RT_SUCCESS(rc))
        {
            pSession->fGipReferenced = 1;
            pDevExt->cGipUsers++;
            if (pDevExt->cGipUsers == 1)
            {
                PSUPGLOBALINFOPAGE pGipR0 = pDevExt->pGip;
                uint64_t u64NanoTS;

                /*
                 * GIP starts/resumes updating again. On windows we bump the
                 * host timer frequency to make sure we don't get stuck in guest
                 * mode and to get better timer (and possibly clock) accuracy.
                 */
                LogFlow(("SUPR0GipMap: Resumes GIP updating\n"));

                supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);

                /*
                 * If this is a resume (not the very first start, where the
                 * transaction id is still at its initial value of 2), round
                 * each CPU's transaction id up to the next
                 * GIP_UPDATEHZ_RECALC_FREQ*2 boundary and zero
                 * u64NanoTSLastUpdateHz.  Presumably this forces a fresh
                 * update-interval/Hz recalculation cycle after the pause -
                 * TODO confirm against supdrvGipUpdate().
                 */
                if (pGipR0->aCPUs[0].u32TransactionId != 2 /* not the first time */)
                {
                    unsigned i;
                    for (i = 0; i < pGipR0->cCpus; i++)
                        ASMAtomicUoWriteU32(&pGipR0->aCPUs[i].u32TransactionId,
                                            (pGipR0->aCPUs[i].u32TransactionId + GIP_UPDATEHZ_RECALC_FREQ * 2)
                                            & ~(GIP_UPDATEHZ_RECALC_FREQ * 2 - 1));
                    ASMAtomicWriteU64(&pGipR0->u64NanoTSLastUpdateHz, 0);
                }

                /*
                 * Re-initialize each CPU's TSC/NanoTS, backdating the
                 * timestamps by one update interval so the first update looks
                 * like a normal one.  In invariant/sync TSC mode (or on UP
                 * hosts) it suffices to touch CPU 0; otherwise do it on every
                 * CPU via RTMpOnAll.
                 */
                u64NanoTS = RTTimeSystemNanoTS() - pGipR0->u32UpdateIntervalNS;
                if (   pGipR0->u32Mode == SUPGIPMODE_INVARIANT_TSC
                    || pGipR0->u32Mode == SUPGIPMODE_SYNC_TSC
                    || RTMpGetOnlineCount() == 1)
                    supdrvGipReInitCpu(pGipR0, &pGipR0->aCPUs[0], u64NanoTS);
                else
                    RTMpOnAll(supdrvGipReInitCpuCallback, pGipR0, &u64NanoTS);

                /*
                 * Detect alternative ways to figure the CPU ID in ring-3 and
                 * raw-mode context. Check the sanity of the APIC IDs, CPU IDs,
                 * and CPU set indexes while we're at it.
                 */
                if (RT_SUCCESS(rc))
                {
                    SUPDRVGIPDETECTGETCPU DetectState;
                    RT_BZERO((void *)&DetectState.bmApicId, sizeof(DetectState.bmApicId));
                    DetectState.fSupported   = UINT32_MAX;
                    DetectState.idCpuProblem = NIL_RTCPUID;
                    rc = RTMpOnAll(supdrvGipDetectGetGipCpuCallback, &DetectState, pGipR0);
                    if (DetectState.idCpuProblem == NIL_RTCPUID)
                    {
                        if (   DetectState.fSupported != UINT32_MAX
                            && DetectState.fSupported != 0)
                        {
                            /* Publish the (possibly changed) getter method mask. */
                            if (pGipR0->fGetGipCpu != DetectState.fSupported)
                            {
                                pGipR0->fGetGipCpu = DetectState.fSupported;
                                LogRel(("SUPR0GipMap: fGetGipCpu=%#x\n", DetectState.fSupported));
                            }
                        }
                        else
                        {
                            /* Either no method survived the AND-merge or the mask
                               never changed from its initial all-bits value. */
                            LogRel(("SUPR0GipMap: No supported ways of getting the APIC ID or CPU number in ring-3! (%#x)\n",
                                    DetectState.fSupported));
                            rc = VERR_UNSUPPORTED_CPU;
                        }
                    }
                    else
                    {
                        LogRel(("SUPR0GipMap: APIC ID, CPU ID or CPU set index problem detected on CPU #%u (%#x)!\n",
                                DetectState.idCpuProblem, DetectState.idCpuProblem));
                        rc = VERR_INVALID_CPU_ID;
                    }
                }

                /*
                 * Start the GIP timer if all is well..
                 */
                if (RT_SUCCESS(rc))
                {
#ifndef DO_NOT_START_GIP
                    rc = RTTimerStart(pDevExt->pGipTimer, 0 /* fire ASAP */); AssertRC(rc);
#endif
                    rc = VINF_SUCCESS;
                }

                /*
                 * Bail out on error: undo the reference and the ring-3 mapping.
                 */
                if (RT_FAILURE(rc))
                {
                    LogRel(("SUPR0GipMap: failed rc=%Rrc\n", rc));
                    pDevExt->cGipUsers = 0;
                    pSession->fGipReferenced = 0;
                    if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
                    {
                        int rc2 = RTR0MemObjFree(pSession->GipMapObjR3, false); AssertRC(rc2);
                        if (RT_SUCCESS(rc2))
                            pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
                    }
                    HCPhys = NIL_RTHCPHYS;
                    pGipR3 = NIL_RTR3PTR;
                }
            }
        }
    }
    else
    {
        rc = VERR_GENERAL_FAILURE;
        Log(("SUPR0GipMap: GIP is not available!\n"));
    }
#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRelease(pDevExt->mtxGip);
#else
    RTSemFastMutexRelease(pDevExt->mtxGip);
#endif

    /*
     * Write returns.
     */
    if (pHCPhysGip)
        *pHCPhysGip = HCPhys;
    if (ppGipR3)
        *ppGipR3 = pGipR3;

#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
#else
    LogFlow((   "SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
#endif
    return rc;
}
574
575
/**
 * Unmaps any user mapping of the GIP and terminates all GIP access
 * from this session.
 *
 * Counterpart of SUPR0GipMap(); the last user suspends GIP updating.
 *
 * @returns IPRT status code.
 * @param   pSession        Session to which the GIP mapping should belong.
 */
SUPR0DECL(int) SUPR0GipUnmap(PSUPDRVSESSION pSession)
{
    int                     rc = VINF_SUCCESS;
    PSUPDRVDEVEXT           pDevExt = pSession->pDevExt;
#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("SUPR0GipUnmap: pSession=%p pGip=%p GipMapObjR3=%p\n",
                pSession,
                pSession->GipMapObjR3 != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pSession->GipMapObjR3) : NULL,
                pSession->GipMapObjR3));
#else
    LogFlow(("SUPR0GipUnmap: pSession=%p\n", pSession));
#endif
    AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);

    /* Same lock as SUPR0GipMap() - serializes mapping and user counting. */
#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
#else
    RTSemFastMutexRequest(pDevExt->mtxGip);
#endif

    /*
     * Unmap anything?  (Frees the ring-3 mapping object created on demand
     * by SUPR0GipMap.)
     */
    if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
    {
        rc = RTR0MemObjFree(pSession->GipMapObjR3, false);
        AssertRC(rc);
        if (RT_SUCCESS(rc))
            pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
    }

    /*
     * Dereference global GIP.  When the user count drops to zero, stop the
     * GIP timer and give back the raised system timer resolution.
     */
    if (pSession->fGipReferenced && !rc)
    {
        pSession->fGipReferenced = 0;
        if (   pDevExt->cGipUsers > 0
            && !--pDevExt->cGipUsers)
        {
            LogFlow(("SUPR0GipUnmap: Suspends GIP updating\n"));
#ifndef DO_NOT_START_GIP
            rc = RTTimerStop(pDevExt->pGipTimer); AssertRC(rc); rc = VINF_SUCCESS;
#endif
            supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
        }
    }

#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRelease(pDevExt->mtxGip);
#else
    RTSemFastMutexRelease(pDevExt->mtxGip);
#endif

    return rc;
}
639
640
641/**
642 * Gets the GIP pointer.
643 *
644 * @returns Pointer to the GIP or NULL.
645 */
646SUPDECL(PSUPGLOBALINFOPAGE) SUPGetGIP(void)
647{
648 return g_pSUPGlobalInfoPage;
649}
650
651#ifdef SUPDRV_USE_TSC_DELTA_THREAD
652
653/**
654 * Switches the TSC-delta measurement thread into the butchered state.
655 *
656 * @returns VBox status code.
657 * @param pDevExt Pointer to the device instance data.
658 * @param fSpinlockHeld Whether the TSC-delta spinlock is held or not.
659 * @param pszFailed An error message to log.
660 * @param rcFailed The error code to exit the thread with.
661 */
662static int supdrvTscDeltaThreadButchered(PSUPDRVDEVEXT pDevExt, bool fSpinlockHeld, const char *pszFailed, int rcFailed)
663{
664 if (!fSpinlockHeld)
665 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
666
667 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Butchered;
668 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
669 OSDBGPRINT(("supdrvTscDeltaThreadButchered: %s. rc=%Rrc\n", rcFailed));
670 return rcFailed;
671}
672
673
/**
 * The TSC-delta measurement thread.
 *
 * Runs a simple state machine (see SUPDRVTSCDELTATHREADSTATE): it listens for
 * wake-ups with an adaptive timeout and performs TSC-delta measurements when
 * moved into the measuring states.  On any internal failure the thread is
 * switched to the "butchered" state and exits.
 *
 * @returns VBox status code.
 * @param   hThread     The thread handle.
 * @param   pvUser      Opaque pointer to the device instance data.
 */
static DECLCALLBACK(int) supdrvTscDeltaThread(RTTHREAD hThread, void *pvUser)
{
    PSUPDRVDEVEXT     pDevExt = (PSUPDRVDEVEXT)pvUser;
    bool              fInitialMeasurement = true;
    uint32_t          cConsecutiveTimeouts = 0;
    int               rc = VERR_INTERNAL_ERROR_2;
    for (;;)
    {
        /*
         * Switch on the current state.  (The state is read and changed under
         * the spinlock; each case is responsible for releasing it.)
         */
        SUPDRVTSCDELTATHREADSTATE enmState;
        RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
        enmState = pDevExt->enmTscDeltaThreadState;
        switch (enmState)
        {
            case kTscDeltaThreadState_Creating:
            {
                /* Tell supdrvTscDeltaThreadInit() we're up and running. */
                pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
                rc = RTSemEventSignal(pDevExt->hTscDeltaEvent);
                if (RT_FAILURE(rc))
                    return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
                /* fall thru */
            }

            case kTscDeltaThreadState_Listening:
            {
                RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);

                /* Simple adaptive timeout: back off 1 -> 10 -> 100 -> 500 ms
                   after every 10 consecutive timeouts. */
                if (cConsecutiveTimeouts++ == 10)
                {
                    if (pDevExt->cMsTscDeltaTimeout == 1)        /* -> 10 ms */
                        pDevExt->cMsTscDeltaTimeout = 10;
                    else if (pDevExt->cMsTscDeltaTimeout == 10)  /* -> 100 ms */
                        pDevExt->cMsTscDeltaTimeout = 100;
                    else if (pDevExt->cMsTscDeltaTimeout == 100) /* -> 500 ms */
                        pDevExt->cMsTscDeltaTimeout = 500;
                    cConsecutiveTimeouts = 0;
                }
                rc = RTThreadUserWait(pDevExt->hTscDeltaThread, pDevExt->cMsTscDeltaTimeout);
                if (   RT_FAILURE(rc)
                    && rc != VERR_TIMEOUT)
                    return supdrvTscDeltaThreadButchered(pDevExt, false /* fSpinlockHeld */, "RTThreadUserWait", rc);
                RTThreadUserReset(pDevExt->hTscDeltaThread);
                break;
            }

            case kTscDeltaThreadState_WaitAndMeasure:
            {
                /* Acknowledge the request, then give the requester a moment
                   before starting to measure. */
                pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Measuring;
                rc = RTSemEventSignal(pDevExt->hTscDeltaEvent); /* (Safe on windows as long as spinlock isn't IRQ safe.) */
                if (RT_FAILURE(rc))
                    return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
                RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
                pDevExt->cMsTscDeltaTimeout = 1; /* reset the adaptive timeout */
                RTThreadSleep(10);
                /* fall thru */
            }

            case kTscDeltaThreadState_Measuring:
            {
                cConsecutiveTimeouts = 0;
                if (fInitialMeasurement)
                {
                    /* First round: measure all CPUs, retrying a few times on
                       transient failures (CPU offline / try-again). */
                    int cTries = 8;
                    int cMsWaitPerTry = 10;
                    fInitialMeasurement = false;
                    do
                    {
                        rc = supdrvMeasureInitialTscDeltas(pDevExt);
                        if (   RT_SUCCESS(rc)
                            || (   RT_FAILURE(rc)
                                && rc != VERR_TRY_AGAIN
                                && rc != VERR_CPU_OFFLINE))
                        {
                            break;
                        }
                        RTThreadSleep(cMsWaitPerTry);
                    } while (cTries-- > 0);
                }
                else
                {
                    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
                    unsigned iCpu;

                    /* Measure TSC-deltas only for the CPUs that are in the set. */
                    rc = VINF_SUCCESS;
                    for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
                    {
                        /* INT64_MAX marks a CPU whose delta hasn't been measured yet. */
                        PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
                        if (   pGipCpuWorker->i64TSCDelta == INT64_MAX
                            && RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
                        {
                            rc |= supdrvMeasureTscDeltaOne(pDevExt, iCpu);
                        }
                    }
                }
                /* Go back to listening unless someone changed the state on us. */
                RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
                if (pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
                    pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
                RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
                Assert(rc != VERR_NOT_AVAILABLE);   /* VERR_NOT_AVAILABLE is used as the initial value. */
                ASMAtomicWriteS32(&pDevExt->rcTscDelta, rc);
                break;
            }

            case kTscDeltaThreadState_Terminating:
                pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Destroyed;
                RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
                return VINF_SUCCESS;

            case kTscDeltaThreadState_Butchered:
            default:
                return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "Invalid state", VERR_INVALID_STATE);
        }
    }

    /* not reached - the loop only exits via return statements above. */
    return rc;
}
801
802
/**
 * Waits for the TSC-delta measurement thread to respond to a state change.
 *
 * @returns VINF_SUCCESS on success, VERR_TIMEOUT if it doesn't respond in time,
 *        other error code on internal error.
 *
 * @param   pDevExt         Pointer to the device instance data.
 * @param   enmCurState     The current state.
 * @param   enmNewState     The new state we're waiting for it to enter.
 */
static int supdrvTscDeltaThreadWait(PSUPDRVDEVEXT pDevExt, SUPDRVTSCDELTATHREADSTATE enmCurState,
                                    SUPDRVTSCDELTATHREADSTATE enmNewState)
{
    /*
     * Wait a short while for the expected state transition.
     */
    int rc;
    RTSemEventWait(pDevExt->hTscDeltaEvent, RT_MS_1SEC);
    RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
    if (pDevExt->enmTscDeltaThreadState == enmNewState)
    {
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
        rc = VINF_SUCCESS;
    }
    else if (pDevExt->enmTscDeltaThreadState == enmCurState)
    {
        /*
         * Wait longer if the state has not yet transitioned to the one we want.
         */
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
        rc = RTSemEventWait(pDevExt->hTscDeltaEvent, 50 * RT_MS_1SEC);
        if (   RT_SUCCESS(rc)
            || rc == VERR_TIMEOUT)
        {
            /*
             * Check the state whether we've succeeded.
             */
            SUPDRVTSCDELTATHREADSTATE enmState;
            RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
            enmState = pDevExt->enmTscDeltaThreadState;
            RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
            if (enmState == enmNewState)
                rc = VINF_SUCCESS;
            else if (enmState == enmCurState)
            {
                /* Still in the old state after the long wait - give up. */
                rc = VERR_TIMEOUT;
                OSDBGPRINT(("supdrvTscDeltaThreadWait: timed out state transition. enmState=%d enmNewState=%d\n", enmState,
                            enmNewState));
            }
            else
            {
                /* Ended up in some third state - something is off. */
                rc = VERR_INTERNAL_ERROR;
                OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d, expected %d\n", enmCurState,
                            enmState, enmNewState));
            }
        }
        else
            OSDBGPRINT(("supdrvTscDeltaThreadWait: RTSemEventWait failed. rc=%Rrc\n", rc));
    }
    else
    {
        /* The thread was in neither the current nor the expected new state. */
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
        OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d\n", enmCurState, enmNewState));
        rc = VERR_INTERNAL_ERROR;
    }

    return rc;
}
871
872
873/**
874 * Terminates the TSC-delta measurement thread.
875 *
876 * @param pDevExt Pointer to the device instance data.
877 */
878static void supdrvTscDeltaThreadTerminate(PSUPDRVDEVEXT pDevExt)
879{
880 int rc;
881 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
882 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Terminating;
883 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
884 RTThreadUserSignal(pDevExt->hTscDeltaThread);
885 rc = RTThreadWait(pDevExt->hTscDeltaThread, 50 * RT_MS_1SEC, NULL /* prc */);
886 if (RT_FAILURE(rc))
887 {
888 /* Signal a few more times before giving up. */
889 int cTriesLeft = 5;
890 while (--cTriesLeft > 0)
891 {
892 RTThreadUserSignal(pDevExt->hTscDeltaThread);
893 rc = RTThreadWait(pDevExt->hTscDeltaThread, 2 * RT_MS_1SEC, NULL /* prc */);
894 if (rc != VERR_TIMEOUT)
895 break;
896 }
897 }
898}
899
900
/**
 * Initializes and spawns the TSC-delta measurement thread.
 *
 * A thread is required for servicing re-measurement requests from events like
 * CPUs coming online, suspend/resume etc. as it cannot be done synchronously
 * under all contexts on all OSs.
 *
 * Creates the spinlock and event semaphore used by the thread, then spawns it
 * and waits for it to reach the Listening state.  On any failure everything
 * created so far is torn down again (innermost failure first).
 *
 * @returns VBox status code.
 * @param   pDevExt           Pointer to the device instance data.
 *
 * @remarks Must only be called -after- initializing GIP and setting up MP
 *          notifications!
 */
static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt)
{
    int rc;
    Assert(pDevExt->pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
    rc = RTSpinlockCreate(&pDevExt->hTscDeltaSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "VBoxTscSpnLck");
    if (RT_SUCCESS(rc))
    {
        rc = RTSemEventCreate(&pDevExt->hTscDeltaEvent);
        if (RT_SUCCESS(rc))
        {
            /* Initial state/timeout must be set before the thread starts. */
            pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Creating;
            pDevExt->cMsTscDeltaTimeout = 1;
            rc = RTThreadCreate(&pDevExt->hTscDeltaThread, supdrvTscDeltaThread, pDevExt, 0 /* cbStack */,
                                RTTHREADTYPE_DEFAULT, RTTHREADFLAGS_WAITABLE, "VBoxTscThread");
            if (RT_SUCCESS(rc))
            {
                /* The thread signals hTscDeltaEvent when it enters Listening. */
                rc = supdrvTscDeltaThreadWait(pDevExt, kTscDeltaThreadState_Creating, kTscDeltaThreadState_Listening);
                if (RT_SUCCESS(rc))
                {
                    /* VERR_NOT_AVAILABLE = "no measurement result yet". */
                    ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
                    return rc;
                }

                OSDBGPRINT(("supdrvTscDeltaInit: supdrvTscDeltaThreadWait failed. rc=%Rrc\n", rc));
                supdrvTscDeltaThreadTerminate(pDevExt);
            }
            else
                OSDBGPRINT(("supdrvTscDeltaInit: RTThreadCreate failed. rc=%Rrc\n", rc));
            RTSemEventDestroy(pDevExt->hTscDeltaEvent);
            pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
        }
        else
            OSDBGPRINT(("supdrvTscDeltaInit: RTSemEventCreate failed. rc=%Rrc\n", rc));
        RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
        pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
    }
    else
        OSDBGPRINT(("supdrvTscDeltaInit: RTSpinlockCreate failed. rc=%Rrc\n", rc));

    return rc;
}
955
956
/**
 * Terminates the TSC-delta measurement thread and cleanup.
 *
 * Safe to call when only some (or none) of the resources were created; each
 * handle is tested against its NIL value before being destroyed.
 *
 * @param   pDevExt   Pointer to the device instance data.
 */
static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt)
{
    /* The thread can only have been spawned when both synchronization objects
       exist (see supdrvTscDeltaThreadInit), so only then shut it down. */
    if (   pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK
        && pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
    {
        supdrvTscDeltaThreadTerminate(pDevExt);
    }

    if (pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK)
    {
        RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
        pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
    }

    if (pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
    {
        RTSemEventDestroy(pDevExt->hTscDeltaEvent);
        pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
    }

    /* Flag that no TSC-delta measurements are available any longer. */
    ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
}
984
985
986/**
987 * Waits for TSC-delta measurements to be completed for all online CPUs.
988 *
989 * @returns VBox status code.
990 * @param pDevExt Pointer to the device instance data.
991 */
992static int supdrvTscDeltaThreadWaitForOnlineCpus(PSUPDRVDEVEXT pDevExt)
993{
994 int cTriesLeft = 5;
995 int cMsTotalWait;
996 int cMsWaited = 0;
997 int cMsWaitGranularity = 1;
998
999 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1000 AssertReturn(pGip, VERR_INVALID_POINTER);
1001
1002 if (RT_UNLIKELY(pDevExt->hTscDeltaThread == NIL_RTTHREAD))
1003 return VERR_THREAD_NOT_WAITABLE;
1004
1005 cMsTotalWait = RT_MIN(pGip->cPresentCpus + 10, 200);
1006 while (cTriesLeft-- > 0)
1007 {
1008 if (RTCpuSetIsEqual(&pDevExt->TscDeltaObtainedCpuSet, &pGip->OnlineCpuSet))
1009 return VINF_SUCCESS;
1010 RTThreadSleep(cMsWaitGranularity);
1011 cMsWaited += cMsWaitGranularity;
1012 if (cMsWaited >= cMsTotalWait)
1013 break;
1014 }
1015
1016 return VERR_TIMEOUT;
1017}
1018
1019#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
1020
1021/**
1022 * Applies the TSC delta to the supplied raw TSC value.
1023 *
1024 * @returns VBox status code. (Ignored by all users, just FYI.)
1025 * @param pGip Pointer to the GIP.
1026 * @param puTsc Pointer to a valid TSC value before the TSC delta has been applied.
1027 * @param idApic The APIC ID of the CPU @c puTsc corresponds to.
1028 * @param fDeltaApplied Where to store whether the TSC delta was succesfully
1029 * applied or not (optional, can be NULL).
1030 *
1031 * @remarks Maybe called with interrupts disabled in ring-0!
1032 *
1033 * @note Don't you dare change the delta calculation. If you really do, make
1034 * sure you update all places where it's used (IPRT, SUPLibAll.cpp,
1035 * SUPDrv.c, supdrvGipMpEvent, and more).
1036 */
1037DECLINLINE(int) supdrvTscDeltaApply(PSUPGLOBALINFOPAGE pGip, uint64_t *puTsc, uint16_t idApic, bool *pfDeltaApplied)
1038{
1039 int rc;
1040
1041 /*
1042 * Validate input.
1043 */
1044 AssertPtr(puTsc);
1045 AssertPtr(pGip);
1046 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
1047
1048 /*
1049 * Carefully convert the idApic into a GIPCPU entry.
1050 */
1051 if (RT_LIKELY(idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)))
1052 {
1053 uint16_t iCpu = pGip->aiCpuFromApicId[idApic];
1054 if (RT_LIKELY(iCpu < pGip->cCpus))
1055 {
1056 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
1057
1058 /*
1059 * Apply the delta if valid.
1060 */
1061 if (RT_LIKELY(pGipCpu->i64TSCDelta != INT64_MAX))
1062 {
1063 *puTsc -= pGipCpu->i64TSCDelta;
1064 if (pfDeltaApplied)
1065 *pfDeltaApplied = true;
1066 return VINF_SUCCESS;
1067 }
1068
1069 rc = VINF_SUCCESS;
1070 }
1071 else
1072 {
1073 AssertMsgFailed(("iCpu=%u cCpus=%u\n", iCpu, pGip->cCpus));
1074 rc = VERR_INVALID_CPU_INDEX;
1075 }
1076 }
1077 else
1078 {
1079 AssertMsgFailed(("idApic=%u\n", idApic));
1080 rc = VERR_INVALID_CPU_ID;
1081 }
1082 if (pfDeltaApplied)
1083 *pfDeltaApplied = false;
1084 return rc;
1085}
1086
1087
/**
 * Measures the TSC frequency of the system.
 *
 * Uses a busy-wait method for the async. case as it is intended to help push
 * the CPU frequency up, while for the invariant cases using a sleeping method.
 *
 * The TSC frequency can vary on systems which are not reported as invariant.
 * On such systems the object of this function is to find out what the nominal,
 * maximum TSC frequency under 'normal' CPU operation.
 *
 * @returns VBox status code.
 * @retval  VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED if all (4) attempts were
 *          foiled by missing TSC deltas.
 * @param   pDevExt        Pointer to the device instance.
 *
 * @remarks Must be called only -after- measuring the TSC deltas.
 */
static int supdrvGipMeasureTscFreq(PSUPDRVDEVEXT pDevExt)
{
    int cTriesLeft = 4;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    /* Assert order. */
    AssertReturn(pGip, VERR_INVALID_PARAMETER);
    AssertReturn(pGip->u32Magic == SUPGLOBALINFOPAGE_MAGIC, VERR_WRONG_ORDER);

    while (cTriesLeft-- > 0)
    {
        RTCCUINTREG uFlags;
        uint64_t    u64NanoTsBefore;
        uint64_t    u64NanoTsAfter;
        uint64_t    u64TscBefore;
        uint64_t    u64TscAfter;
        uint8_t     idApicBefore;
        uint8_t     idApicAfter;

        /*
         * Synchronize with the host OS clock tick before reading the TSC.
         * Especially important on older Windows version where the granularity is terrible.
         */
        u64NanoTsBefore = RTTimeSystemNanoTS();
        while (RTTimeSystemNanoTS() == u64NanoTsBefore)
            ASMNopPause();

        /* Take the starting samples (APIC ID, TSC, nano TS) back-to-back with
           interrupts disabled so they belong to the same CPU and instant. */
        uFlags          = ASMIntDisableFlags();
        idApicBefore    = ASMGetApicId();
        u64TscBefore    = ASMReadTSC();
        u64NanoTsBefore = RTTimeSystemNanoTS();
        ASMSetFlags(uFlags);

        if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
        {
            /*
             * Sleep-wait since the TSC frequency is constant, it eases host load.
             * Shorter interval produces more variance in the frequency (esp. Windows).
             */
            RTThreadSleep(200);
            u64NanoTsAfter = RTTimeSystemNanoTS();
            /* Re-synchronize with a clock tick after waking up. */
            while (RTTimeSystemNanoTS() == u64NanoTsAfter)
                ASMNopPause();
            u64NanoTsAfter = RTTimeSystemNanoTS();
        }
        else
        {
            /* Busy-wait keeping the frequency up and measure. */
            for (;;)
            {
                u64NanoTsAfter = RTTimeSystemNanoTS();
                if (u64NanoTsAfter < RT_NS_100MS + u64NanoTsBefore)
                    ASMNopPause();
                else
                    break;
            }
        }

        /* Take the end samples, again with interrupts disabled. */
        uFlags      = ASMIntDisableFlags();
        idApicAfter = ASMGetApicId();
        u64TscAfter = ASMReadTSC();
        ASMSetFlags(uFlags);

        if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
        {
            int rc;
            bool fAppliedBefore;
            bool fAppliedAfter;
            /* Normalize both TSC readings so they are comparable even if we
               were rescheduled to a different CPU in between. */
            rc = supdrvTscDeltaApply(pGip, &u64TscBefore, idApicBefore, &fAppliedBefore); AssertRCReturn(rc, rc);
            rc = supdrvTscDeltaApply(pGip, &u64TscAfter, idApicAfter, &fAppliedAfter); AssertRCReturn(rc, rc);

            if (   !fAppliedBefore
                || !fAppliedAfter)
            {
#ifdef SUPDRV_USE_TSC_DELTA_THREAD
                /*
                 * The TSC-delta measurements are kicked-off asynchronously as each host CPU is initialized.
                 * Therefore, if we failed to have a delta for the CPU(s) we were scheduled on (idApicBefore
                 * and idApicAfter) then wait until we have TSC-delta measurements for all online CPUs and
                 * proceed. This should be triggered just once if we're rather unlucky.
                 */
                rc = supdrvTscDeltaThreadWaitForOnlineCpus(pDevExt);
                if (rc == VERR_TIMEOUT)
                {
                    SUPR0Printf("vboxdrv: supdrvGipMeasureTscFreq: timedout waiting for TSC-delta measurements.\n");
                    return VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED;
                }
#else
                SUPR0Printf("vboxdrv: supdrvGipMeasureTscFreq: idApicBefore=%u idApicAfter=%u cTriesLeft=%u\n",
                            idApicBefore, idApicAfter, cTriesLeft);
#endif
                /* Retry the whole measurement. */
                continue;
            }
        }

        /*
         * Update GIP.
         */
        pGip->u64CpuHz = ((u64TscAfter - u64TscBefore) * RT_NS_1SEC_64) / (u64NanoTsAfter - u64NanoTsBefore);
        if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
            pGip->aCPUs[0].u64CpuHz = pGip->u64CpuHz;
        return VINF_SUCCESS;
    }

    return VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED;
}
1209
1210
/**
 * Timer callback function for TSC frequency refinement in invariant GIP mode.
 *
 * Recomputes pGip->u64CpuHz from the TSC/nanosecond deltas accumulated since
 * supdrvRefineTscFreq() recorded the anchor points.
 *
 * @param   pTimer      The timer.
 * @param   pvUser      Opaque pointer to the device instance data.
 * @param   iTick       The timer tick.
 */
static DECLCALLBACK(void) supdrvRefineTscTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
{
    PSUPDRVDEVEXT       pDevExt = (PSUPDRVDEVEXT)pvUser;
    PSUPGLOBALINFOPAGE  pGip = pDevExt->pGip;
    bool                fDeltaApplied = false;
    uint8_t             idApic;
    uint64_t            u64DeltaNanoTS;
    uint64_t            u64DeltaTsc;
    uint64_t            u64NanoTS;
    uint64_t            u64Tsc;
    RTCCUINTREG         uFlags;

    /* Paranoia. */
    Assert(pGip);
    Assert(pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC);

#if !defined(RT_OS_OS2) /* PORTME: Disable if timers are called from clock interrupt handler or with interrupts disabled. */
    /* Synchronize with a host clock tick to reduce jitter in the reading. */
    u64NanoTS = RTTimeSystemNanoTS();
    while (RTTimeSystemNanoTS() == u64NanoTS)
        ASMNopPause();
#endif
    /* Sample APIC ID, TSC and nano TS back-to-back with interrupts disabled. */
    uFlags = ASMIntDisableFlags();
    idApic = ASMGetApicId();
    u64Tsc = ASMReadTSC();
    u64NanoTS = RTTimeSystemNanoTS();
    ASMSetFlags(uFlags);
    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
        supdrvTscDeltaApply(pGip, &u64Tsc, idApic, &fDeltaApplied);
    u64DeltaNanoTS = u64NanoTS - pDevExt->u64NanoTSAnchor;
    u64DeltaTsc = u64Tsc - pDevExt->u64TscAnchor;

    /* Give up refining if no TSC delta was available for this CPU; the TSC
       reading would not be comparable with the anchor. */
    if (RT_UNLIKELY(   pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO
                    && !fDeltaApplied))
    {
        Log(("vboxdrv: failed to refine TSC frequency as TSC-deltas unavailable after %d seconds!\n",
             GIP_TSC_REFINE_INTERVAL));
        return;
    }

    /* Calculate the TSC frequency. */
    if (   u64DeltaTsc < UINT64_MAX / RT_NS_1SEC
        && u64DeltaNanoTS < UINT32_MAX)
        pGip->u64CpuHz = ASMMultU64ByU32DivByU32(u64DeltaTsc, RT_NS_1SEC, (uint32_t)u64DeltaNanoTS);
    else
    {
        /* Fall back to 128-bit math when the intermediate product would
           overflow 64 bits or the divisor does not fit in 32 bits. */
        RTUINT128U CpuHz, Tmp, Divisor;
        CpuHz.s.Lo = CpuHz.s.Hi = 0;
        RTUInt128MulU64ByU64(&Tmp, u64DeltaTsc, RT_NS_1SEC_64);
        RTUInt128Div(&CpuHz, &Tmp, RTUInt128AssignU64(&Divisor, u64DeltaNanoTS));
        pGip->u64CpuHz = CpuHz.s.Lo;
    }

    /* Update rest of GIP. */
    Assert(pGip->u32Mode != SUPGIPMODE_ASYNC_TSC); /* See SUPGetCpuHzFromGIP().*/
    pGip->aCPUs[0].u64CpuHz = pGip->u64CpuHz;
}
1274
1275
/**
 * Starts the TSC-frequency refinement phase asynchronously.
 *
 * Records the TSC/nanosecond anchor points and arms a one-shot timer
 * (supdrvRefineTscTimer) that recomputes the frequency over a longer
 * interval.  Failure to create/start the timer is non-fatal; the initial
 * measurement simply remains unrefined.
 *
 * @param   pDevExt     Pointer to the device instance data.
 */
static void supdrvRefineTscFreq(PSUPDRVDEVEXT pDevExt)
{
    uint64_t            u64NanoTS;
    RTCCUINTREG         uFlags;
    uint8_t             idApic;
    int                 rc;
    PSUPGLOBALINFOPAGE  pGip;

    /* Validate. */
    Assert(pDevExt);
    Assert(pDevExt->pGip);
    pGip = pDevExt->pGip;

#ifdef SUPDRV_USE_TSC_DELTA_THREAD
    /*
     * If the TSC-delta thread is created, wait until it's done calculating
     * the TSC-deltas on the relevant online CPUs before we start the TSC refinement.
     */
    if (   pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
        && ASMAtomicReadS32(&pDevExt->rcTscDelta) == VERR_NOT_AVAILABLE)
    {
        rc = supdrvTscDeltaThreadWaitForOnlineCpus(pDevExt);
        if (rc == VERR_TIMEOUT)
        {
            SUPR0Printf("vboxdrv: Skipping refinement of TSC frequency as TSC-delta measurement timed out!\n");
            return;
        }
    }
#endif

    /*
     * Record the TSC and NanoTS as the starting anchor point for refinement of the
     * TSC. We deliberately avoid using SUPReadTSC() here as we want to keep the
     * reading of the TSC and the NanoTS as close as possible.
     */
    u64NanoTS = RTTimeSystemNanoTS();
    while (RTTimeSystemNanoTS() == u64NanoTS)
        ASMNopPause();
    /* Sample APIC ID, TSC and nano TS back-to-back with interrupts disabled. */
    uFlags = ASMIntDisableFlags();
    idApic = ASMGetApicId();
    pDevExt->u64TscAnchor = ASMReadTSC();
    pDevExt->u64NanoTSAnchor = RTTimeSystemNanoTS();
    ASMSetFlags(uFlags);
    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
        supdrvTscDeltaApply(pGip, &pDevExt->u64TscAnchor, idApic, NULL /* pfDeltaApplied */);

    rc = RTTimerCreateEx(&pDevExt->pTscRefineTimer, 0 /* one-shot */, RTTIMER_FLAGS_CPU_ANY, supdrvRefineTscTimer, pDevExt);
    if (RT_SUCCESS(rc))
    {
        /*
         * Refine the TSC frequency measurement over a long interval. Ideally, we want to keep the
         * interval as small as possible while gaining the most consistent and accurate frequency
         * (compared to what the host OS might have measured).
         *
         * In theory, we gain more accuracy with longer intervals, but we want VMs to startup with the
         * same TSC frequency whenever possible so we need to keep the interval short.
         */
        rc = RTTimerStart(pDevExt->pTscRefineTimer, GIP_TSC_REFINE_INTERVAL * RT_NS_1SEC_64);
        AssertRC(rc);
    }
    else
        OSDBGPRINT(("RTTimerCreateEx failed to create one-shot timer. rc=%Rrc\n", rc));
}
1344
1345
1346/**
1347 * Creates the GIP.
1348 *
1349 * @returns VBox status code.
1350 * @param pDevExt Instance data. GIP stuff may be updated.
1351 */
1352int VBOXCALL supdrvGipCreate(PSUPDRVDEVEXT pDevExt)
1353{
1354 PSUPGLOBALINFOPAGE pGip;
1355 RTHCPHYS HCPhysGip;
1356 uint32_t u32SystemResolution;
1357 uint32_t u32Interval;
1358 uint32_t u32MinInterval;
1359 uint32_t uMod;
1360 unsigned cCpus;
1361 int rc;
1362
1363 LogFlow(("supdrvGipCreate:\n"));
1364
1365 /* Assert order. */
1366 Assert(pDevExt->u32SystemTimerGranularityGrant == 0);
1367 Assert(pDevExt->GipMemObj == NIL_RTR0MEMOBJ);
1368 Assert(!pDevExt->pGipTimer);
1369
1370 /*
1371 * Check the CPU count.
1372 */
1373 cCpus = RTMpGetArraySize();
1374 if ( cCpus > RTCPUSET_MAX_CPUS
1375 || cCpus > 256 /* ApicId is used for the mappings */)
1376 {
1377 SUPR0Printf("VBoxDrv: Too many CPUs (%u) for the GIP (max %u)\n", cCpus, RT_MIN(RTCPUSET_MAX_CPUS, 256));
1378 return VERR_TOO_MANY_CPUS;
1379 }
1380
1381 /*
1382 * Allocate a contiguous set of pages with a default kernel mapping.
1383 */
1384 rc = RTR0MemObjAllocCont(&pDevExt->GipMemObj, RT_UOFFSETOF(SUPGLOBALINFOPAGE, aCPUs[cCpus]), false /*fExecutable*/);
1385 if (RT_FAILURE(rc))
1386 {
1387 OSDBGPRINT(("supdrvGipCreate: failed to allocate the GIP page. rc=%d\n", rc));
1388 return rc;
1389 }
1390 pGip = (PSUPGLOBALINFOPAGE)RTR0MemObjAddress(pDevExt->GipMemObj); AssertPtr(pGip);
1391 HCPhysGip = RTR0MemObjGetPagePhysAddr(pDevExt->GipMemObj, 0); Assert(HCPhysGip != NIL_RTHCPHYS);
1392
1393 /*
1394 * Allocate the TSC-delta sync struct on a separate cache line.
1395 */
1396 pDevExt->pvTscDeltaSync = RTMemAllocZ(sizeof(SUPTSCDELTASYNC) + 63);
1397 pDevExt->pTscDeltaSync = RT_ALIGN_PT(pDevExt->pvTscDeltaSync, 64, PSUPTSCDELTASYNC);
1398 Assert(RT_ALIGN_PT(pDevExt->pTscDeltaSync, 64, PSUPTSCDELTASYNC) == pDevExt->pTscDeltaSync);
1399
1400 /*
1401 * Find a reasonable update interval and initialize the structure.
1402 */
1403 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
1404 /** @todo figure out why using a 100Ms interval upsets timekeeping in VMs.
1405 * See @bugref{6710}. */
1406 u32MinInterval = RT_NS_10MS;
1407 u32SystemResolution = RTTimerGetSystemGranularity();
1408 u32Interval = u32MinInterval;
1409 uMod = u32MinInterval % u32SystemResolution;
1410 if (uMod)
1411 u32Interval += u32SystemResolution - uMod;
1412
1413 supdrvGipInit(pDevExt, pGip, HCPhysGip, RTTimeSystemNanoTS(), RT_NS_1SEC / u32Interval /*=Hz*/, u32Interval, cCpus);
1414
1415 if (RT_UNLIKELY( pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_ZERO_CLAIMED
1416 && pGip->u32Mode == SUPGIPMODE_ASYNC_TSC
1417 && !supdrvOSGetForcedAsyncTscMode(pDevExt)))
1418 {
1419 /* Basically, invariant Windows boxes, should never be detected as async (i.e. TSC-deltas should be 0). */
1420 OSDBGPRINT(("supdrvGipCreate: The TSC-deltas should be normalized by the host OS, but verifying shows it's not!\n"));
1421 return VERR_INTERNAL_ERROR_2;
1422 }
1423
1424 RTCpuSetEmpty(&pDevExt->TscDeltaCpuSet);
1425 RTCpuSetEmpty(&pDevExt->TscDeltaObtainedCpuSet);
1426#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1427 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1428 {
1429 /* Initialize TSC-delta measurement thread before executing any Mp event callbacks. */
1430 rc = supdrvTscDeltaThreadInit(pDevExt);
1431 }
1432#endif
1433 if (RT_SUCCESS(rc))
1434 {
1435 rc = RTMpNotificationRegister(supdrvGipMpEvent, pDevExt);
1436 if (RT_SUCCESS(rc))
1437 {
1438 rc = RTMpOnAll(supdrvGipInitOnCpu, pDevExt, pGip);
1439 if (RT_SUCCESS(rc))
1440 {
1441#ifndef SUPDRV_USE_TSC_DELTA_THREAD
1442 uint16_t iCpu;
1443 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1444 {
1445 /*
1446 * Measure the TSC deltas now that we have MP notifications.
1447 */
1448 int cTries = 5;
1449 do
1450 {
1451 rc = supdrvMeasureInitialTscDeltas(pDevExt);
1452 if ( rc != VERR_TRY_AGAIN
1453 && rc != VERR_CPU_OFFLINE)
1454 break;
1455 } while (--cTries > 0);
1456 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
1457 Log(("supdrvTscDeltaInit: cpu[%u] delta %lld\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta));
1458 }
1459 else
1460 {
1461 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
1462 AssertMsg(!pGip->aCPUs[iCpu].i64TSCDelta, ("iCpu=%u %lld mode=%d\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta, pGip->u32Mode));
1463 }
1464#endif
1465 if (RT_SUCCESS(rc))
1466 {
1467 rc = supdrvGipMeasureTscFreq(pDevExt);
1468 if (RT_SUCCESS(rc))
1469 {
1470 /*
1471 * Create the timer.
1472 * If CPU_ALL isn't supported we'll have to fall back to synchronous mode.
1473 */
1474 if (pGip->u32Mode == SUPGIPMODE_ASYNC_TSC)
1475 {
1476 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, RTTIMER_FLAGS_CPU_ALL, supdrvGipAsyncTimer,
1477 pDevExt);
1478 if (rc == VERR_NOT_SUPPORTED)
1479 {
1480 OSDBGPRINT(("supdrvGipCreate: omni timer not supported, falling back to synchronous mode\n"));
1481 pGip->u32Mode = SUPGIPMODE_SYNC_TSC;
1482 }
1483 }
1484 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
1485 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, 0 /* fFlags */,
1486 supdrvGipSyncAndInvariantTimer, pDevExt);
1487 if (RT_SUCCESS(rc))
1488 {
1489 /*
1490 * We're good.
1491 */
1492 Log(("supdrvGipCreate: %u ns interval.\n", u32Interval));
1493 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
1494
1495 g_pSUPGlobalInfoPage = pGip;
1496 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1497 supdrvRefineTscFreq(pDevExt);
1498 return VINF_SUCCESS;
1499 }
1500
1501 OSDBGPRINT(("supdrvGipCreate: failed create GIP timer at %u ns interval. rc=%Rrc\n", u32Interval, rc));
1502 Assert(!pDevExt->pGipTimer);
1503 }
1504 else
1505 OSDBGPRINT(("supdrvGipCreate: supdrvGipMeasureTscFreq failed. rc=%Rrc\n", rc));
1506 }
1507 else
1508 OSDBGPRINT(("supdrvGipCreate: supdrvMeasureInitialTscDeltas failed. rc=%Rrc\n", rc));
1509 }
1510 else
1511 OSDBGPRINT(("supdrvGipCreate: RTMpOnAll failed. rc=%Rrc\n", rc));
1512 }
1513 else
1514 OSDBGPRINT(("supdrvGipCreate: failed to register MP event notfication. rc=%Rrc\n", rc));
1515 }
1516 else
1517 OSDBGPRINT(("supdrvGipCreate: supdrvTscDeltaInit failed. rc=%Rrc\n", rc));
1518
1519 supdrvGipDestroy(pDevExt); /* Releases timer frequency increase too. */
1520 return rc;
1521}
1522
1523
/**
 * Terminates the GIP.
 *
 * Tears everything down in roughly the reverse order of creation: MP
 * notifications first (so no callback races the teardown), then the TSC-delta
 * thread, the refinement timer, the sync structure, the GIP data, the GIP
 * timer, the backing memory and finally the timer resolution grant.  Safe to
 * call on a partially created GIP; every step is guarded.
 *
 * @param   pDevExt     Instance data. GIP stuff may be updated.
 */
void VBOXCALL supdrvGipDestroy(PSUPDRVDEVEXT pDevExt)
{
    int rc;
#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("supdrvGipDestroy: pDevExt=%p pGip=%p pGipTimer=%p GipMemObj=%p\n", pDevExt,
                pDevExt->GipMemObj != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pDevExt->GipMemObj) : NULL,
                pDevExt->pGipTimer, pDevExt->GipMemObj));
#endif

    /*
     * Stop receiving MP notifications before tearing anything else down.
     */
    RTMpNotificationDeregister(supdrvGipMpEvent, pDevExt);

#ifdef SUPDRV_USE_TSC_DELTA_THREAD
    /*
     * Terminate the TSC-delta measurement thread and resources.
     */
    supdrvTscDeltaTerm(pDevExt);
#endif

    /*
     * Destroy the TSC-refinement one-shot timer.
     */
    if (pDevExt->pTscRefineTimer)
    {
        RTTimerDestroy(pDevExt->pTscRefineTimer);
        pDevExt->pTscRefineTimer = NULL;
    }

    /* Free the TSC-delta sync structure (pTscDeltaSync is just the aligned
       view into the pvTscDeltaSync allocation, see supdrvGipCreate). */
    if (pDevExt->pvTscDeltaSync)
    {
        RTMemFree(pDevExt->pvTscDeltaSync);
        pDevExt->pTscDeltaSync = NULL;
        pDevExt->pvTscDeltaSync = NULL;
    }

    /*
     * Invalidate the GIP data.
     */
    if (pDevExt->pGip)
    {
        supdrvGipTerm(pDevExt->pGip);
        pDevExt->pGip = NULL;
    }
    g_pSUPGlobalInfoPage = NULL;

    /*
     * Destroy the timer and free the GIP memory object.
     */
    if (pDevExt->pGipTimer)
    {
        rc = RTTimerDestroy(pDevExt->pGipTimer); AssertRC(rc);
        pDevExt->pGipTimer = NULL;
    }

    if (pDevExt->GipMemObj != NIL_RTR0MEMOBJ)
    {
        rc = RTR0MemObjFree(pDevExt->GipMemObj, true /* free mappings */); AssertRC(rc);
        pDevExt->GipMemObj = NIL_RTR0MEMOBJ;
    }

    /*
     * Finally, make sure we've release the system timer resolution request
     * if one actually succeeded and is still pending.
     */
    supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
}
1597
1598
/**
 * Timer callback function for the sync and invariant GIP modes.
 *
 * Samples TSC and nano TS with interrupts disabled and feeds them to
 * supdrvGipUpdate().  With the TSC-delta thread enabled it also flags the
 * thread when there are CPUs with pending delta measurements.
 *
 * @param   pTimer      The timer.
 * @param   pvUser      Opaque pointer to the device extension.
 * @param   iTick       The timer tick.
 */
static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
{
    RTCCUINTREG         uFlags;
    uint64_t            u64TSC;
    uint64_t            u64NanoTS;
    PSUPDRVDEVEXT       pDevExt = (PSUPDRVDEVEXT)pvUser;
    PSUPGLOBALINFOPAGE  pGip = pDevExt->pGip;

    uFlags    = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
    u64TSC    = ASMReadTSC();
    u64NanoTS = RTTimeSystemNanoTS();

    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
    {
        /*
         * The calculations in supdrvGipUpdate() is very timing sensitive and doesn't handle
         * missed timer ticks. So for now it is better to use a delta of 0 and have the TSC rate
         * affected a bit until we get proper TSC deltas than implementing options like
         * rescheduling the tick to be delivered on the right CPU or missing the tick entirely.
         *
         * The likely hood of this happening is really low. On Windows, Linux, and Solaris
         * timers fire on the CPU they were registered/started on. Darwin timers doesn't
         * necessarily (they are high priority threads waiting).
         */
        Assert(!ASMIntAreEnabled());
        supdrvTscDeltaApply(pGip, &u64TSC, ASMGetApicId(), NULL /* pfDeltaApplied */);
    }

    supdrvGipUpdate(pDevExt, u64NanoTS, u64TSC, NIL_RTCPUID, iTick);

    ASMSetFlags(uFlags);

#ifdef SUPDRV_USE_TSC_DELTA_THREAD
    /* If CPUs still need their TSC deltas measured, move the TSC-delta thread
       into the wait-and-measure state (under its spinlock). */
    if (   pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
        && !RTCpuSetIsEmpty(&pDevExt->TscDeltaCpuSet))
    {
        RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
        if (   pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
            || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
            pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
        /** @todo Do the actual poking using -- RTThreadUserSignal() */
    }
#endif
}
1651
1652
/**
 * Timer callback function for async GIP mode.
 *
 * This timer is created with RTTIMER_FLAGS_CPU_ALL (see supdrvGipCreate), so
 * it fires on every CPU: the GIP master performs the full GIP update while
 * other CPUs only update their own per-CPU data.
 *
 * @param   pTimer      The timer.
 * @param   pvUser      Opaque pointer to the device extension.
 * @param   iTick       The timer tick.
 */
static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
{
    RTCCUINTREG     fOldFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
    PSUPDRVDEVEXT   pDevExt   = (PSUPDRVDEVEXT)pvUser;
    RTCPUID         idCpu     = RTMpCpuId();
    uint64_t        u64TSC    = ASMReadTSC();
    uint64_t        NanoTS    = RTTimeSystemNanoTS();

    /** @todo reset the transaction number and whatnot when iTick == 1. */
    if (pDevExt->idGipMaster == idCpu)
        supdrvGipUpdate(pDevExt, NanoTS, u64TSC, idCpu, iTick);
    else
        supdrvGipUpdatePerCpu(pDevExt, NanoTS, u64TSC, idCpu, ASMGetApicId(), iTick);

    ASMSetFlags(fOldFlags);
}
1675
1676
1677/**
1678 * Finds our (@a idCpu) entry, or allocates a new one if not found.
1679 *
1680 * @returns Index of the CPU in the cache set.
1681 * @param pGip The GIP.
1682 * @param idCpu The CPU ID.
1683 */
1684static uint32_t supdrvGipFindOrAllocCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
1685{
1686 uint32_t i, cTries;
1687
1688 /*
1689 * ASSUMES that CPU IDs are constant.
1690 */
1691 for (i = 0; i < pGip->cCpus; i++)
1692 if (pGip->aCPUs[i].idCpu == idCpu)
1693 return i;
1694
1695 cTries = 0;
1696 do
1697 {
1698 for (i = 0; i < pGip->cCpus; i++)
1699 {
1700 bool fRc;
1701 ASMAtomicCmpXchgSize(&pGip->aCPUs[i].idCpu, idCpu, NIL_RTCPUID, fRc);
1702 if (fRc)
1703 return i;
1704 }
1705 } while (cTries++ < 32);
1706 AssertReleaseFailed();
1707 return i - 1;
1708}
1709
1710
1711/**
1712 * Finds the GIP CPU index corresponding to @a idCpu.
1713 *
1714 * @returns GIP CPU array index, UINT32_MAX if not found.
1715 * @param pGip The GIP.
1716 * @param idCpu The CPU ID.
1717 */
1718static uint32_t supdrvGipFindCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
1719{
1720 uint32_t i;
1721 for (i = 0; i < pGip->cCpus; i++)
1722 if (pGip->aCPUs[i].idCpu == idCpu)
1723 return i;
1724 return UINT32_MAX;
1725}
1726
1727
/**
 * The calling CPU should be accounted as online, update GIP accordingly.
 *
 * This is used by supdrvGipCreate() as well as supdrvGipMpEvent().
 *
 * @param   pDevExt             The device extension.
 * @param   idCpu               The CPU ID.
 *
 * @remarks Must run on the CPU being onlined (release-asserted below), since
 *          it reads the local APIC ID.
 */
static void supdrvGipMpEventOnline(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
{
    int         iCpuSet = 0;
    uint16_t    idApic = UINT16_MAX;
    uint32_t    i = 0;
    uint64_t    u64NanoTS = 0;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    AssertPtrReturnVoid(pGip);
    AssertRelease(idCpu == RTMpCpuId());
    Assert(pGip->cPossibleCpus == RTMpGetCount());

    /*
     * Do this behind a spinlock with interrupts disabled as this can fire
     * on all CPUs simultaneously, see @bugref{6110}.
     */
    RTSpinlockAcquire(pDevExt->hGipSpinlock);

    /*
     * Update the globals.
     */
    ASMAtomicWriteU16(&pGip->cPresentCpus, RTMpGetPresentCount());
    ASMAtomicWriteU16(&pGip->cOnlineCpus, RTMpGetOnlineCount());
    iCpuSet = RTMpCpuIdToSetIndex(idCpu);
    if (iCpuSet >= 0)
    {
        Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
        RTCpuSetAddByIndex(&pGip->OnlineCpuSet, iCpuSet);
        RTCpuSetAddByIndex(&pGip->PresentCpuSet, iCpuSet);
    }

    /*
     * Update the entry.
     */
    /* Start the timestamp one update interval in the past (NOTE(review):
       presumably so the first tick yields a sane interval — confirm). */
    u64NanoTS = RTTimeSystemNanoTS() - pGip->u32UpdateIntervalNS;
    i = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
    supdrvGipInitCpu(pDevExt, pGip, &pGip->aCPUs[i], u64NanoTS);
    idApic = ASMGetApicId();
    ASMAtomicWriteU16(&pGip->aCPUs[i].idApic, idApic);
    ASMAtomicWriteS16(&pGip->aCPUs[i].iCpuSet, (int16_t)iCpuSet);
    ASMAtomicWriteSize(&pGip->aCPUs[i].idCpu, idCpu);

    /*
     * Update the APIC ID and CPU set index mappings.
     */
    ASMAtomicWriteU16(&pGip->aiCpuFromApicId[idApic], i);
    ASMAtomicWriteU16(&pGip->aiCpuFromCpuSetIdx[iCpuSet], i);

    /* Update the Mp online/offline counter. */
    ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);

    /* Add this CPU to the set of CPUs for which we need to calculate their TSC-deltas. */
    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
    {
        RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
#ifdef SUPDRV_USE_TSC_DELTA_THREAD
        /* Nudge the TSC-delta thread so it picks up the new CPU. */
        RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
        if (   pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
            || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
        {
            pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
        }
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
#endif
    }

    /* commit it */
    ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_ONLINE);

    RTSpinlockRelease(pDevExt->hGipSpinlock);
}
1807
1808
1809/**
1810 * The CPU should be accounted as offline, update the GIP accordingly.
1811 *
1812 * This is used by supdrvGipMpEvent.
1813 *
1814 * @param pDevExt The device extension.
1815 * @param idCpu The CPU ID.
1816 */
1817static void supdrvGipMpEventOffline(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
1818{
1819 int iCpuSet;
1820 unsigned i;
1821
1822 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1823
1824 AssertPtrReturnVoid(pGip);
1825 RTSpinlockAcquire(pDevExt->hGipSpinlock);
1826
1827 iCpuSet = RTMpCpuIdToSetIndex(idCpu);
1828 AssertReturnVoid(iCpuSet >= 0);
1829
1830 i = pGip->aiCpuFromCpuSetIdx[iCpuSet];
1831 AssertReturnVoid(i < pGip->cCpus);
1832 AssertReturnVoid(pGip->aCPUs[i].idCpu == idCpu);
1833
1834 Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
1835 RTCpuSetDelByIndex(&pGip->OnlineCpuSet, iCpuSet);
1836
1837 /* Update the Mp online/offline counter. */
1838 ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);
1839
1840 /* If we are the initiator going offline while measuring the TSC delta, unspin other waiting CPUs! */
1841 if (ASMAtomicReadU32(&pDevExt->idTscDeltaInitiator) == idCpu)
1842 {
1843 ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_START);
1844 ASMAtomicWriteU64(&pGip->aCPUs[i].u64TSCSample, ~GIP_TSC_DELTA_RSVD);
1845 }
1846
1847 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1848 {
1849 /* Reset the TSC delta, we will recalculate it lazily. */
1850 ASMAtomicWriteS64(&pGip->aCPUs[i].i64TSCDelta, INT64_MAX);
1851 /* Remove this CPU from the set of CPUs that we have obtained the TSC deltas. */
1852 RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, iCpuSet);
1853 }
1854
1855 /* commit it */
1856 ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_OFFLINE);
1857
1858 RTSpinlockRelease(pDevExt->hGipSpinlock);
1859}
1860
1861
/**
 * Multiprocessor event notification callback.
 *
 * This is used to make sure that the GIP master gets passed on to
 * another CPU.  It also updates the associated CPU data.
 *
 * @param   enmEvent    The event.
 * @param   idCpu       The cpu it applies to.
 * @param   pvUser      Pointer to the device extension.
 *
 * @remarks This function -must- fire on the newly online'd CPU for the
 *          RTMPEVENT_ONLINE case and can fire on any CPU for the
 *          RTMPEVENT_OFFLINE case.
 */
static DECLCALLBACK(void) supdrvGipMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser)
{
    PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    /* MP notification callbacks are expected to run with preemption disabled. */
    AssertRelease(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));

    /*
     * Update the GIP CPU data.
     * (pGip can be NULL if the event fires before/after the GIP lifetime.)
     */
    if (pGip)
    {
        switch (enmEvent)
        {
            case RTMPEVENT_ONLINE:
                /* The online event must be delivered on the CPU that came online. */
                AssertRelease(idCpu == RTMpCpuId());
                supdrvGipMpEventOnline(pDevExt, idCpu);
                break;
            case RTMPEVENT_OFFLINE:
                supdrvGipMpEventOffline(pDevExt, idCpu);
                break;
        }
    }

    /*
     * Make sure there is a master GIP.
     */
    if (enmEvent == RTMPEVENT_OFFLINE)
    {
        RTCPUID idGipMaster = ASMAtomicReadU32(&pDevExt->idGipMaster);
        if (idGipMaster == idCpu)
        {
            /*
             * The GIP master is going offline, find a new one.
             * Pick the first online CPU that isn't the outgoing master.
             */
            bool fIgnored;
            unsigned i;
            RTCPUID idNewGipMaster = NIL_RTCPUID;
            RTCPUSET OnlineCpus;
            RTMpGetOnlineSet(&OnlineCpus);

            for (i = 0; i < RTCPUSET_MAX_CPUS; i++)
                if (RTCpuSetIsMemberByIndex(&OnlineCpus, i))
                {
                    RTCPUID idCurCpu = RTMpCpuIdFromSetIndex(i);
                    if (idCurCpu != idGipMaster)
                    {
                        idNewGipMaster = idCurCpu;
                        break;
                    }
                }

            Log(("supdrvGipMpEvent: Gip master %#lx -> %#lx\n", (long)idGipMaster, (long)idNewGipMaster));
            /* CmpXchg so we only install the new master if nobody else already
               changed it since we sampled idGipMaster above. */
            ASMAtomicCmpXchgSize(&pDevExt->idGipMaster, idNewGipMaster, idGipMaster, fIgnored);
            NOREF(fIgnored);
        }
    }
}
1934
1935
1936/*
1937 * Select TSC delta measurement algorithm.
1938 */
1939#if 1
1940# define GIP_TSC_DELTA_METHOD_1
1941#else
1942# define GIP_TSC_DELTA_METHOD_2
1943#endif
1944
1945
#ifdef GIP_TSC_DELTA_METHOD_2

/**
 * TSC delta measurement algorithm \#2 result entry.
 *
 * One TSC sample together with the sequence numbers that cross reference it
 * with the other CPU's sample set.
 */
typedef struct SUPDRVTSCDELTAMETHOD2ENTRY
{
    /** Our own sequence number when the sample was taken. */
    uint32_t    iSeqMine;
    /** The other CPU's sequence number as observed when the sample was taken. */
    uint32_t    iSeqOther;
    /** The TSC reading. */
    uint64_t    uTsc;
} SUPDRVTSCDELTAMETHOD2ENTRY;

/**
 * TSC delta measurement algorithm \#2 Data.
 */
typedef struct SUPDRVTSCDELTAMETHOD2
{
    /** Padding to make sure the iCurSeqNo is in its own cache line.
     * ASSUMES cacheline sizes <= 128 bytes. */
    uint32_t                    au32CacheLinePaddingBefore[128 / sizeof(uint32_t)];
    /** The current sequence number of this worker. */
    uint32_t volatile           iCurSeqNo;
    /** Padding to make sure the iCurSeqNo is in its own cache line.
     * ASSUMES cacheline sizes <= 128 bytes. */
    uint32_t                    au32CacheLinePaddingAfter[128 / sizeof(uint32_t) - 1];
    /** Result table. */
    SUPDRVTSCDELTAMETHOD2ENTRY  aResults[96];
} SUPDRVTSCDELTAMETHOD2;
/** Pointer to the data for TSC delta measurement algorithm \#2 .*/
typedef SUPDRVTSCDELTAMETHOD2 *PSUPDRVTSCDELTAMETHOD2;

#endif /* GIP_TSC_DELTA_METHOD_2 */
1978
/**
 * Argument package/state passed by supdrvMeasureTscDeltaOne to the RTMpOn
 * callback worker.
 */
typedef struct SUPDRVGIPTSCDELTARGS
{
    /** Pointer to the device instance data. */
    PSUPDRVDEVEXT       pDevExt;
    /** Pointer to the GIP CPU entry of the worker whose delta is measured. */
    PSUPGIPCPU          pWorker;
    /** Pointer to the GIP CPU entry of the master. */
    PSUPGIPCPU          pMaster;
    /** The CPU id of the master (may differ from pDevExt->idGipMaster if a
     *  substitute master was picked, see supdrvMeasureTscDeltaOne). */
    RTCPUID             idMaster;
#ifdef GIP_TSC_DELTA_METHOD_2
    /** Sample buffer filled by the master CPU. */
    PSUPDRVTSCDELTAMETHOD2  pMasterData;
    /** Sample buffer filled by the worker CPU. */
    PSUPDRVTSCDELTAMETHOD2  pWorkerData;
    /** Number of sample pairs successfully cross referenced. */
    uint32_t                cHits;
    /*uint32_t                cOffByOne;*/
    uint32_t                iAttempt;       /**< 1-base outer loop counter. */
    /** Whether the master should pause between samples this round. */
    bool                    fLagMaster;
    /** Whether the worker should pause between samples this round. */
    bool                    fLagWorker;
#endif
} SUPDRVGIPTSCDELTARGS;
typedef SUPDRVGIPTSCDELTARGS *PSUPDRVGIPTSCDELTARGS;
2000
2001
2002#ifdef GIP_TSC_DELTA_METHOD_2
2003/*
2004 * TSC delta measurement algorithm \#2 configuration and code - Experimental!!
2005 */
2006# undef GIP_TSC_DELTA_LOOPS
2007# undef GIP_TSC_DELTA_READ_TIME_LOOPS
2008# undef GIP_TSC_DELTA_PRIMER_LOOPS
2009# define GIP_TSC_DELTA_LOOPS 17
2010# define GIP_TSC_DELTA_PRIMER_LOOPS 1
2011# define GIP_TSC_DELTA_READ_TIME_LOOPS GIP_TSC_DELTA_PRIMER_LOOPS /* no read-time-loops necessary */
2012
2013
2014static int supdrvTscDeltaMethod2Init(PSUPDRVGIPTSCDELTARGS pArgs)
2015{
2016 uint32_t const fFlags = /*RTMEMALLOCEX_FLAGS_ANY_CTX |*/ RTMEMALLOCEX_FLAGS_ZEROED;
2017 int rc = RTMemAllocEx(sizeof(*pArgs->pMasterData), 0, fFlags, (void **)&pArgs->pWorkerData);
2018 if (RT_SUCCESS(rc))
2019 rc = RTMemAllocEx(sizeof(*pArgs->pMasterData), 0, fFlags, (void **)&pArgs->pMasterData);
2020 return rc;
2021}
2022
2023
2024static void supdrvTscDeltaMethod2Term(PSUPDRVGIPTSCDELTARGS pArgs)
2025{
2026 RTMemFreeEx(pArgs->pMasterData, sizeof(*pArgs->pMasterData));
2027 RTMemFreeEx(pArgs->pWorkerData, sizeof(*pArgs->pWorkerData));
2028 /*SUPR0Printf("cHits=%d cOffByOne=%d m=%d w=%d\n", pArgs->cHits, pArgs->cOffByOne, pArgs->pMaster->idApic, pArgs->pWorker->idApic);*/
2029}
2030
2031
2032static void supdrvTscDeltaMethod2Looped(PSUPDRVGIPTSCDELTARGS pArgs, RTCPUID idCpu, unsigned iLoop)
2033{
2034 if (pArgs->idMaster == idCpu)
2035 {
2036 if (iLoop < GIP_TSC_DELTA_PRIMER_LOOPS)
2037 {
2038 if (iLoop == 0)
2039 pArgs->iAttempt++;
2040
2041 /* Lag during the priming to be nice to everyone.. */
2042 pArgs->fLagMaster = true;
2043 pArgs->fLagWorker = true;
2044 }
2045 else if (iLoop < (GIP_TSC_DELTA_LOOPS - GIP_TSC_DELTA_PRIMER_LOOPS) / 4)
2046 {
2047 /* 25 % of the body without lagging. */
2048 pArgs->fLagMaster = false;
2049 pArgs->fLagWorker = false;
2050 }
2051 else if (iLoop < (GIP_TSC_DELTA_LOOPS - GIP_TSC_DELTA_PRIMER_LOOPS) / 4 * 2)
2052 {
2053 /* 25 % of the body with both lagging. */
2054 pArgs->fLagMaster = true;
2055 pArgs->fLagWorker = true;
2056 }
2057 else
2058 {
2059 /* 50% of the body with alternating lag. */
2060 pArgs->fLagMaster = (iLoop & 1) == 0;
2061 pArgs->fLagWorker = (iLoop & 1) == 1;
2062 }
2063 }
2064}
2065
2066
/**
 * The core function of the 2nd TSC delta measurement algorithm.
 *
 * The idea here is that we have the two CPUs execute the exact same code
 * collecting a largish set of TSC samples.  The code has one data dependency on
 * the other CPU which intention it is to synchronize the execution as well as
 * help cross references the two sets of TSC samples (the sequence numbers).
 *
 * The @a fLag parameter is used to modify the execution a tiny bit on one or
 * both of the CPUs.  When @a fLag differs between the CPUs, it is thought that
 * it will help with making the CPUs enter lock step execution occasionally.
 *
 * @param   pMyData     This CPU's sample buffer (sequence counter + results).
 * @param   piOtherSeqNo    Pointer to the other CPU's live sequence counter.
 * @param   fLag        Whether to pause briefly after each sample.
 *
 * @note    The exact instruction/serialization order below is load bearing;
 *          do not reorder (see the inline comments).
 */
static void supdrvTscDeltaMethod2CollectData(PSUPDRVTSCDELTAMETHOD2 pMyData, uint32_t volatile *piOtherSeqNo, bool fLag)
{
    SUPDRVTSCDELTAMETHOD2ENTRY *pEntry = &pMyData->aResults[0];
    uint32_t                    cLeft  = RT_ELEMENTS(pMyData->aResults);

    /* Restart our sequence counter for this run. */
    ASMAtomicWriteU32(&pMyData->iCurSeqNo, 0);
    ASMSerializeInstruction();
    while (cLeft-- > 0)
    {
        uint64_t uTsc;
        /* Odd sequence number = sample in progress; the increment below makes
           it even again when the sample is complete. */
        uint32_t iSeqMine  = ASMAtomicIncU32(&pMyData->iCurSeqNo);
        uint32_t iSeqOther = ASMAtomicReadU32(piOtherSeqNo);
        ASMCompilerBarrier();
        ASMSerializeInstruction(); /* Way better result than with ASMMemoryFenceSSE2() in this position! */
        uTsc = ASMReadTSC();
        ASMAtomicIncU32(&pMyData->iCurSeqNo);
        ASMCompilerBarrier();
        ASMSerializeInstruction();
        /* Record the sample; the entry array was sized to hold every loop
           iteration, so no bounds check is needed here. */
        pEntry->iSeqMine  = iSeqMine;
        pEntry->iSeqOther = iSeqOther;
        pEntry->uTsc      = uTsc;
        pEntry++;
        ASMSerializeInstruction();
        if (fLag)
            ASMNopPause();
    }
}
2107
2108
2109static void supdrvTscDeltaMethod2ProcessDataSet(PSUPDRVGIPTSCDELTARGS pArgs, PSUPDRVTSCDELTAMETHOD2 pMyData,
2110 bool fIsMaster, uint32_t cResults,
2111 PSUPDRVTSCDELTAMETHOD2 pOtherData, int64_t iMasterTscDelta,
2112 int64_t volatile *piWorkerTscDelta)
2113{
2114 uint32_t cHits = 0;
2115#if 0
2116 uint32_t cOffByOne = 0;
2117#endif
2118 uint32_t idxResult = 0;
2119 int64_t iBestDelta = *piWorkerTscDelta;
2120
2121 if (cResults > RT_ELEMENTS(pMyData->aResults))
2122 cResults = RT_ELEMENTS(pMyData->aResults);
2123
2124 for (idxResult = 0; idxResult < cResults; idxResult++)
2125 {
2126 uint32_t idxOther = pMyData->aResults[idxResult].iSeqOther;
2127 if (idxOther & 1)
2128 {
2129 idxOther >>= 1;
2130 if (idxOther < RT_ELEMENTS(pOtherData->aResults))
2131 {
2132 if (pOtherData->aResults[idxOther].iSeqOther == pMyData->aResults[idxResult].iSeqMine)
2133 {
2134 int64_t iDelta;
2135 if (fIsMaster)
2136 iDelta = pOtherData->aResults[idxOther].uTsc
2137 - (pMyData->aResults[idxResult].uTsc - iMasterTscDelta);
2138 else
2139 iDelta = (pOtherData->aResults[idxResult].uTsc - iMasterTscDelta)
2140 - pMyData->aResults[idxOther].uTsc;
2141 if ( iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
2142 ? iDelta < iBestDelta
2143 : iDelta > iBestDelta || iBestDelta == INT64_MAX)
2144 iBestDelta = iDelta;
2145 cHits++;
2146 }
2147 }
2148 }
2149#if 0 /* Can be used to detect battles between threads on the same core. Decided to change the master instead. */
2150 else
2151 {
2152 idxOther >>= 1;
2153 if ( idxOther < RT_ELEMENTS(pOtherData->aResults)
2154 && pOtherData->aResults[idxOther].iSeqOther == pMyData->aResults[idxResult].iSeqMine)
2155 cOffByOne++;
2156 }
2157#endif
2158 }
2159
2160 if (cHits > 0)
2161 *piWorkerTscDelta = iBestDelta;
2162 pArgs->cHits += cHits;
2163#if 0
2164 pArgs->cOffByOne += cOffByOne;
2165#endif
2166}
2167
2168
/**
 * Runs supdrvTscDeltaMethod2ProcessDataSet over both the master's and the
 * worker's sample sets, refining pArgs->pWorker->i64TSCDelta.
 *
 * The master set is processed in full; the worker set only up to the number
 * of samples the worker has completed so far (its sequence counter >> 1).
 *
 * @param   pArgs       The argument/state package.
 * @param   fFinalLoop  Whether this is the final loop iteration.
 *                      NOTE(review): currently unused by the body — confirm
 *                      whether it was intended for future use.
 */
static void supdrvTscDeltaMethod2ProcessDataOnMaster(PSUPDRVGIPTSCDELTARGS pArgs, bool fFinalLoop)
{
    supdrvTscDeltaMethod2ProcessDataSet(pArgs,
                                        pArgs->pMasterData,
                                        true /*fIsMaster*/,
                                        RT_ELEMENTS(pArgs->pMasterData->aResults),
                                        pArgs->pWorkerData,
                                        pArgs->pMaster->i64TSCDelta,
                                        &pArgs->pWorker->i64TSCDelta);

    supdrvTscDeltaMethod2ProcessDataSet(pArgs,
                                        pArgs->pWorkerData,
                                        false /*fIsMaster*/,
                                        ASMAtomicReadU32(&pArgs->pWorkerData->iCurSeqNo) >> 1,
                                        pArgs->pMasterData,
                                        pArgs->pMaster->i64TSCDelta,
                                        &pArgs->pWorker->i64TSCDelta);
}
2187
2188#endif /* GIP_TSC_DELTA_METHOD_2 */
2189
2190
/**
 * Callback used by supdrvMeasureInitialTscDeltas() to read the TSC on two CPUs
 * and compute the delta between them.
 *
 * @param   idCpu       The CPU we are current scheduled on.
 * @param   pvUser1     Pointer to a parameter package (SUPDRVGIPTSCDELTARGS).
 * @param   pvUser2     Unused.
 *
 * @remarks Measuring TSC deltas between the CPUs is tricky because we need to
 *          read the TSC at exactly the same time on both the master and the
 *          worker CPUs. Due to DMA, bus arbitration, cache locality,
 *          contention, SMI, pipelining etc. there is no guaranteed way of
 *          doing this on x86 CPUs.
 *
 *          GIP_TSC_DELTA_METHOD_1:
 *          We ignore the first few runs of the loop in order to prime the
 *          cache. Also, we need to be careful about using 'pause' instruction
 *          in critical busy-wait loops in this code - it can cause undesired
 *          behaviour with hyperthreading.
 *
 *          We try to minimize the measurement error by computing the minimum
 *          read time of the compare statement in the worker by taking TSC
 *          measurements across it.
 *
 *          It must be noted that the computed minimum read time is mostly to
 *          eliminate huge deltas when the worker is too early and doesn't by
 *          itself help produce more accurate deltas. We allow two times the
 *          computed minimum as an arbitrary acceptable threshold. Therefore,
 *          it is still possible to get negative deltas where there are none
 *          when the worker is earlier. As long as these occasional negative
 *          deltas are lower than the time it takes to exit guest-context and
 *          the OS to reschedule EMT on a different CPU we won't expose a TSC
 *          that jumped backwards. It is because of the existence of the
 *          negative deltas we don't recompute the delta with the master and
 *          worker interchanged to eliminate the remaining measurement error.
 *
 *          For GIP_TSC_DELTA_METHOD_2, see supdrvTscDeltaMethod2CollectData.
 */
static DECLCALLBACK(void) supdrvMeasureTscDeltaCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
{
    PSUPDRVGIPTSCDELTARGS pArgs = (PSUPDRVGIPTSCDELTARGS)pvUser1;
    PSUPDRVDEVEXT pDevExt = pArgs->pDevExt;
    PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
    PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
    RTCPUID idMaster = pArgs->idMaster;
    int cTriesLeft;

    /* A bit of paranoia first. */
    if (!pGipCpuMaster || !pGipCpuWorker)
        return;

    /* If the CPU isn't part of the measurement, return immediately. */
    if (   idCpu != idMaster
        && idCpu != pGipCpuWorker->idCpu)
        return;

    /* If the IPRT API isn't concurrent safe, the master and worker wait for each other
       with a timeout to avoid deadlocking the entire system. */
    if (!RTMpOnAllIsConcurrentSafe())
    {
        /** @todo This was introduced for Windows, but since Windows doesn't use this
         *        code path any longer (as DPC timeouts BSOD regardless of interrupts,
         *        see @bugref{6710} comment 81), eventually phase it out. */
        uint64_t       uTscNow;
        uint64_t       uTscStart;
        uint64_t const cWaitTicks = 130000; /* Arbitrary value, can be tweaked later. */

        ASMSerializeInstruction();
        uTscStart = ASMReadTSC();
        if (idCpu == idMaster)
        {
            /* Master signals readiness and busy-waits (with a TSC tick budget)
               for the worker to acknowledge. */
            ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_PRESTART_MASTER);
            while (ASMAtomicReadU32(&pDevExt->pTscDeltaSync->u) != GIP_TSC_DELTA_SYNC_PRESTART_WORKER)
            {
                ASMSerializeInstruction();
                uTscNow = ASMReadTSC();
                if (uTscNow - uTscStart > cWaitTicks)
                {
                    /* Set the worker delta to indicate failure, not the master. */
                    ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
                    return;
                }

                ASMNopPause();
            }
        }
        else
        {
            /* Worker waits for the master's prestart signal, then acknowledges. */
            while (ASMAtomicReadU32(&pDevExt->pTscDeltaSync->u) != GIP_TSC_DELTA_SYNC_PRESTART_MASTER)
            {
                ASMSerializeInstruction();
                uTscNow = ASMReadTSC();
                if (uTscNow - uTscStart > cWaitTicks)
                {
                    ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
                    return;
                }

                ASMNopPause();
            }
            ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_PRESTART_WORKER);
        }
    }

    /*
     * Retry loop: run the measurement up to 12 times, stopping as soon as a
     * usable delta has been established (i64TSCDelta != INT64_MAX).
     */
    Assert(pGipCpuWorker->i64TSCDelta == INT64_MAX);
    cTriesLeft = 12;
    while (cTriesLeft-- > 0)
    {
        unsigned i;
        uint64_t uMinCmpReadTime = UINT64_MAX;
        for (i = 0; i < GIP_TSC_DELTA_LOOPS; i++)
        {
#ifdef GIP_TSC_DELTA_METHOD_2
            supdrvTscDeltaMethod2Looped(pArgs, idCpu, i);
#endif
            if (idCpu == idMaster)
            {
                /*
                 * The master.
                 */
                RTCCUINTREG uFlags;
                AssertMsg(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD,
                          ("%#llx idMaster=%#x idWorker=%#x (idGipMaster=%#x)\n",
                           pGipCpuMaster->u64TSCSample, idMaster, pGipCpuWorker->idCpu, pDevExt->idGipMaster));
                ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_START);

                /* Disable interrupts only in the master for as short a period
                   as possible, thanks again to Windows. See @bugref{6710} comment #73. */
                uFlags = ASMIntDisableFlags();

                /* Wait for the worker to flip the sync variable to READY. */
                while (ASMAtomicReadU32(&pDevExt->pTscDeltaSync->u) == GIP_TSC_DELTA_SYNC_START)
                { /* nothing */ }

#ifdef GIP_TSC_DELTA_METHOD_1
                /* Publish our TSC sample; loop in case the read happens to
                   equal the reserved sentinel value. */
                do
                {
                    ASMSerializeInstruction();
                    ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, ASMReadTSC());
                } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);

#elif defined(GIP_TSC_DELTA_METHOD_2)
                supdrvTscDeltaMethod2CollectData(pArgs->pMasterData, &pArgs->pWorkerData->iCurSeqNo, pArgs->fLagMaster);
#else
# error "tsc delta method not selected"
#endif

                /* Sync up with worker. */
                ASMSetFlags(uFlags);

                while (ASMAtomicReadU32(&pDevExt->pTscDeltaSync->u) != GIP_TSC_DELTA_SYNC_WORKER_DONE)
                { /* nothing */ }

                /* Process the data. */
                if (i > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
                {
#ifdef GIP_TSC_DELTA_METHOD_1
                    if (pGipCpuWorker->u64TSCSample != GIP_TSC_DELTA_RSVD)
                    {
                        /* Delta = workerTsc - masterTsc adjusted to the
                           reference timeline; keep the smallest magnitude. */
                        int64_t iDelta = pGipCpuWorker->u64TSCSample
                                       - (pGipCpuMaster->u64TSCSample - pGipCpuMaster->i64TSCDelta);
                        if (  iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                            ? iDelta < pGipCpuWorker->i64TSCDelta
                            : iDelta > pGipCpuWorker->i64TSCDelta || pGipCpuWorker->i64TSCDelta == INT64_MAX)
                            pGipCpuWorker->i64TSCDelta = iDelta;
                    }
#elif defined(GIP_TSC_DELTA_METHOD_2)
                    if (i > GIP_TSC_DELTA_PRIMER_LOOPS)
                        supdrvTscDeltaMethod2ProcessDataOnMaster(pArgs, i == GIP_TSC_DELTA_LOOPS - 1);
#else
# error "tsc delta method not selected"
#endif
                }

                /* Reset our TSC sample and tell the worker to move on. */
                ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, GIP_TSC_DELTA_RSVD);
                ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_STOP);
            }
            else
            {
                /*
                 * The worker.
                 */
                uint64_t uTscWorker;
                uint64_t uTscWorkerFlushed;
                uint64_t uCmpReadTime;

                ASMAtomicReadU64(&pGipCpuMaster->u64TSCSample);     /* Warm the cache line. */
                while (ASMAtomicReadU32(&pDevExt->pTscDeltaSync->u) != GIP_TSC_DELTA_SYNC_START)
                { /* nothing */ }
                Assert(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
                ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_WORKER_READY);

#ifdef GIP_TSC_DELTA_METHOD_1
                /*
                 * Keep reading the TSC until we notice that the master has read his. Reading
                 * the TSC -after- the master has updated the memory is way too late. We thus
                 * compensate by trying to measure how long it took for the worker to notice
                 * the memory flushed from the master.
                 */
                do
                {
                    ASMSerializeInstruction();
                    uTscWorker = ASMReadTSC();
                } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
                ASMSerializeInstruction();
                uTscWorkerFlushed = ASMReadTSC();

                uCmpReadTime = uTscWorkerFlushed - uTscWorker;
                if (i > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
                {
                    /* This is totally arbitrary a.k.a I don't like it but I have no better ideas for now. */
                    if (uCmpReadTime < (uMinCmpReadTime << 1))
                    {
                        ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, uTscWorker);
                        if (uCmpReadTime < uMinCmpReadTime)
                            uMinCmpReadTime = uCmpReadTime;
                    }
                    else
                        ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
                }
                else if (i > GIP_TSC_DELTA_PRIMER_LOOPS)
                {
                    /* Still in the read-time calibration phase: only update
                       the minimum, don't publish a sample. */
                    if (uCmpReadTime < uMinCmpReadTime)
                        uMinCmpReadTime = uCmpReadTime;
                }

#elif defined(GIP_TSC_DELTA_METHOD_2)
                supdrvTscDeltaMethod2CollectData(pArgs->pWorkerData, &pArgs->pMasterData->iCurSeqNo, pArgs->fLagWorker);
#else
# error "tsc delta method not selected"
#endif

                /* Tell master we're done collecting our data. */
                ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_WORKER_DONE);

                /* Wait for the master to process the data. */
                while (ASMAtomicReadU32(&pDevExt->pTscDeltaSync->u) == GIP_TSC_DELTA_SYNC_WORKER_DONE)
                    ASMNopPause();
            }
        }

        /*
         * We must reset the worker TSC sample value in case it gets picked as a
         * GIP master later on (it's trashed above, naturally).
         */
        if (idCpu == idMaster)
            ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);

        /*
         * Success? If so, stop trying.
         */
        if (pGipCpuWorker->i64TSCDelta != INT64_MAX)
        {
            /* Book-keep which CPU now has a measured delta. */
            if (idCpu == idMaster)
            {
                RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuMaster->iCpuSet);
                RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuMaster->iCpuSet);
            }
            else
            {
                RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
                RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
            }
            break;
        }
    }
}
2461
2462
2463/**
2464 * Clears TSC delta related variables.
2465 *
2466 * Clears all TSC samples as well as the delta synchronization variable on the
2467 * all the per-CPU structs. Optionally also clears the per-cpu deltas too.
2468 *
2469 * @param pDevExt Pointer to the device instance data.
2470 * @param fClearDeltas Whether the deltas are also to be cleared.
2471 */
2472DECLINLINE(void) supdrvClearTscSamples(PSUPDRVDEVEXT pDevExt, bool fClearDeltas)
2473{
2474 unsigned iCpu;
2475 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2476 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
2477 {
2478 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
2479 ASMAtomicWriteU64(&pGipCpu->u64TSCSample, GIP_TSC_DELTA_RSVD);
2480 if (fClearDeltas)
2481 ASMAtomicWriteS64(&pGipCpu->i64TSCDelta, INT64_MAX);
2482 }
2483 ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_STOP);
2484}
2485
2486
/**
 * Measures the TSC delta between the master GIP CPU and one specified worker
 * CPU.
 *
 * @returns VBox status code.
 * @param   pDevExt         Pointer to the device instance data.
 * @param   idxWorker       The index of the worker CPU from the GIP's array of
 *                          CPUs.
 *
 * @remarks This must be called with preemption enabled!
 */
static int supdrvMeasureTscDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker)
{
    int rc;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    RTCPUID idMaster = pDevExt->idGipMaster;
    PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[idxWorker];
    PSUPGIPCPU pGipCpuMaster;
    uint32_t iGipCpuMaster;

    /* Validate input a bit. */
    AssertReturn(pGip, VERR_INVALID_PARAMETER);
    Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
    Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD));

    /*
     * Don't attempt measuring the delta for the GIP master.
     */
    if (pGipCpuWorker->idCpu == idMaster)
    {
        if (pGipCpuWorker->i64TSCDelta == INT64_MAX) /* This shouldn't happen, but just in case. */
            ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
        return VINF_SUCCESS;
    }

    /*
     * If the CPU has hyper-threading and the APIC IDs of the master and worker are adjacent,
     * try pick a different master.  (This fudge only works with multi core systems.)
     * ASSUMES related threads have adjacent APIC IDs.  ASSUMES two threads per core.
     */
    iGipCpuMaster = supdrvGipFindCpuIndexForCpuId(pGip, idMaster);
    AssertReturn(iGipCpuMaster < pGip->cCpus, VERR_INVALID_CPU_ID);
    pGipCpuMaster = &pGip->aCPUs[iGipCpuMaster];
    if (   (pGipCpuMaster->idApic & ~1) == (pGipCpuWorker->idApic & ~1)
        && ASMHasCpuId()
        && ASMIsValidStdRange(ASMCpuId_EAX(0))
        && (ASMCpuId_EDX(1) & X86_CPUID_FEATURE_EDX_HTT)
        && pGip->cOnlineCpus > 2)
    {
        /* Substitute master: any other online CPU that already has a delta and
           does not share a core with either the worker or the current master. */
        uint32_t i;
        for (i = 0; i < pGip->cCpus; i++)
            if (   i != iGipCpuMaster
                && i != idxWorker
                && pGip->aCPUs[i].enmState == SUPGIPCPUSTATE_ONLINE
                && pGip->aCPUs[i].i64TSCDelta != INT64_MAX
                && pGip->aCPUs[i].idCpu != NIL_RTCPUID
                && pGip->aCPUs[i].idCpu != idMaster /* paranoia starts here... */
                && pGip->aCPUs[i].idCpu != pGipCpuWorker->idCpu
                && pGip->aCPUs[i].idApic != pGipCpuWorker->idApic
                && pGip->aCPUs[i].idApic != pGipCpuMaster->idApic)
            {
                iGipCpuMaster = i;
                pGipCpuMaster = &pGip->aCPUs[i];
                idMaster = pGipCpuMaster->idCpu;
                break;
            }
    }

    /*
     * Set the master TSC as the initiator.  This serializes delta measurements.
     */
    while (!ASMAtomicCmpXchgU32(&pDevExt->idTscDeltaInitiator, idMaster, NIL_RTCPUID))
    {
        /*
         * Sleep here rather than spin as there is a parallel measurement
         * being executed and that can take a good while to be done.
         */
        RTThreadSleep(1);
    }

    if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpuWorker->iCpuSet))
    {
        /*
         * Initialize data package for the RTMpOnAll callback.
         */
        SUPDRVGIPTSCDELTARGS Args;
        RT_ZERO(Args);
        Args.pWorker  = pGipCpuWorker;
        Args.pMaster  = pGipCpuMaster;
        Args.idMaster = idMaster;
        Args.pDevExt  = pDevExt;
#ifdef GIP_TSC_DELTA_METHOD_1
        rc = VINF_SUCCESS;
#elif defined(GIP_TSC_DELTA_METHOD_2)
        rc = supdrvTscDeltaMethod2Init(&Args);
#else
# error "huh?"
#endif
        if (RT_SUCCESS(rc))
        {
            /*
             * Fire TSC-read workers on all CPUs but only synchronize between master
             * and one worker to ease memory contention.
             */
            ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
            ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_STOP);

            rc = RTMpOnAll(supdrvMeasureTscDeltaCallback, &Args, NULL);
            if (RT_SUCCESS(rc))
            {
                if (RT_LIKELY(pGipCpuWorker->i64TSCDelta != INT64_MAX))
                {
                    /*
                     * Work the TSC delta applicability rating.  It starts
                     * optimistic in supdrvGipInit, we downgrade it here.
                     */
                    SUPGIPUSETSCDELTA enmRating;
                    if (   pGipCpuWorker->i64TSCDelta >  GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO
                        || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO)
                        enmRating = SUPGIPUSETSCDELTA_NOT_ZERO;
                    else if (   pGipCpuWorker->i64TSCDelta >  GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO
                             || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO)
                        enmRating = SUPGIPUSETSCDELTA_ROUGHLY_ZERO;
                    else
                        enmRating = SUPGIPUSETSCDELTA_PRACTICALLY_ZERO;
                    if (pGip->enmUseTscDelta < enmRating)
                    {
                        /* Ratings only ever get worse (numerically larger). */
                        AssertCompile(sizeof(pGip->enmUseTscDelta) == sizeof(uint32_t));
                        ASMAtomicWriteU32((uint32_t volatile *)&pGip->enmUseTscDelta, enmRating);
                    }
                }
                else
                    rc = VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
            }
        }

#ifdef GIP_TSC_DELTA_METHOD_2
        supdrvTscDeltaMethod2Term(&Args);
#endif
    }
    else
        rc = VERR_CPU_OFFLINE;

    /* Release the initiator slot so a parallel measurement can proceed. */
    ASMAtomicWriteU32(&pDevExt->idTscDeltaInitiator, NIL_RTCPUID);
    return rc;
}
2633
2634
2635/**
2636 * Performs the initial measurements of the TSC deltas between CPUs.
2637 *
2638 * This is called by supdrvGipCreate or triggered by it if threaded.
2639 *
2640 * @returns VBox status code.
2641 * @param pDevExt Pointer to the device instance data.
2642 *
2643 * @remarks Must be called only after supdrvGipInitOnCpu() as this function uses
2644 * idCpu, GIP's online CPU set which are populated in
2645 * supdrvGipInitOnCpu().
2646 */
2647static int supdrvMeasureInitialTscDeltas(PSUPDRVDEVEXT pDevExt)
2648{
2649 PSUPGIPCPU pGipCpuMaster;
2650 unsigned iCpu;
2651 unsigned iOddEven;
2652 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
2653 uint32_t idxMaster = UINT32_MAX;
2654 int rc = VINF_SUCCESS;
2655 uint32_t cMpOnOffEvents = ASMAtomicReadU32(&pDevExt->cMpOnOffEvents);
2656
2657 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
2658
2659 /*
2660 * Pick the first CPU online as the master TSC and make it the new GIP master based
2661 * on the APIC ID.
2662 *
2663 * Technically we can simply use "idGipMaster" but doing this gives us master as CPU 0
2664 * in most cases making it nicer/easier for comparisons. It is safe to update the GIP
2665 * master as this point since the sync/async timer isn't created yet.
2666 */
2667 supdrvClearTscSamples(pDevExt, true /* fClearDeltas */);
2668 for (iCpu = 0; iCpu < RT_ELEMENTS(pGip->aiCpuFromApicId); iCpu++)
2669 {
2670 uint16_t idxCpu = pGip->aiCpuFromApicId[iCpu];
2671 if (idxCpu != UINT16_MAX)
2672 {
2673 PSUPGIPCPU pGipCpu = &pGip->aCPUs[idxCpu];
2674 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpu->iCpuSet))
2675 {
2676 idxMaster = idxCpu;
2677 pGipCpu->i64TSCDelta = GIP_TSC_DELTA_INITIAL_MASTER_VALUE;
2678 break;
2679 }
2680 }
2681 }
2682 AssertReturn(idxMaster != UINT32_MAX, VERR_CPU_NOT_FOUND);
2683 pGipCpuMaster = &pGip->aCPUs[idxMaster];
2684 ASMAtomicWriteSize(&pDevExt->idGipMaster, pGipCpuMaster->idCpu);
2685
2686 /*
2687 * If there is only a single CPU online we have nothing to do.
2688 */
2689 if (pGip->cOnlineCpus <= 1)
2690 {
2691 AssertReturn(pGip->cOnlineCpus > 0, VERR_INTERNAL_ERROR_5);
2692 return VINF_SUCCESS;
2693 }
2694
2695 /*
2696 * Loop thru the GIP CPU array and get deltas for each CPU (except the
2697 * master). We do the CPUs with the even numbered APIC IDs first so that
2698 * we've got alternative master CPUs to pick from on hyper-threaded systems.
2699 */
2700 for (iOddEven = 0; iOddEven < 2; iOddEven++)
2701 {
2702 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
2703 {
2704 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
2705 if ( iCpu != idxMaster
2706 && (iOddEven > 0 || (pGipCpuWorker->idApic & 1) == 0)
2707 && RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
2708 {
2709 rc = supdrvMeasureTscDeltaOne(pDevExt, iCpu);
2710 if (RT_FAILURE(rc))
2711 {
2712 SUPR0Printf("supdrvMeasureTscDeltaOne failed. rc=%d CPU[%u].idCpu=%u Master[%u].idCpu=%u\n", rc, iCpu,
2713 pGipCpuWorker->idCpu, idxMaster, pDevExt->idGipMaster, pGipCpuMaster->idCpu);
2714 break;
2715 }
2716
2717 if (ASMAtomicReadU32(&pDevExt->cMpOnOffEvents) != cMpOnOffEvents)
2718 {
2719 SUPR0Printf("One or more CPUs transitioned between online & offline states. I'm confused, retry...\n");
2720 rc = VERR_TRY_AGAIN;
2721 break;
2722 }
2723 }
2724 }
2725 }
2726
2727 return rc;
2728}
2729
2730
2731/**
2732 * Callback used by supdrvDetermineAsyncTSC to read the TSC on a CPU.
2733 *
2734 * @param idCpu Ignored.
2735 * @param pvUser1 Where to put the TSC.
2736 * @param pvUser2 Ignored.
2737 */
2738static DECLCALLBACK(void) supdrvDetermineAsyncTscWorker(RTCPUID idCpu, void *pvUser1, void *pvUser2)
2739{
2740 ASMAtomicWriteU64((uint64_t volatile *)pvUser1, ASMReadTSC());
2741}
2742
2743
/**
 * Determine if Async GIP mode is required because of TSC drift.
 *
 * When using the default/normal timer code it is essential that the time stamp counter
 * (TSC) runs never backwards, that is, a read operation to the counter should return
 * a bigger value than any previous read operation. This is guaranteed by the latest
 * AMD CPUs and by newer Intel CPUs which never enter the C2 state (P4). In any other
 * case we have to choose the asynchronous timer mode.
 *
 * @param   poffMin     Pointer to the determined difference between different
 *                      cores (optional, can be NULL).
 * @return  false if the time stamp counters appear to be synchronized, true otherwise.
 */
static bool supdrvDetermineAsyncTsc(uint64_t *poffMin)
{
    /*
     * Just iterate all the cpus 8 times and make sure that the TSC is
     * ever increasing. We don't bother taking TSC rollover into account.
     */
    int      iEndCpu = RTMpGetArraySize();
    int      iCpu;
    int      cLoops  = 8;
    bool     fAsync  = false;
    int      rc      = VINF_SUCCESS;
    uint64_t offMax  = 0;
    uint64_t offMin  = ~(uint64_t)0;
    uint64_t PrevTsc = ASMReadTSC();

    while (cLoops-- > 0)
    {
        for (iCpu = 0; iCpu < iEndCpu; iCpu++)
        {
            uint64_t CurTsc;
            /* Read the TSC on the target CPU via an IPI-style callback. */
            rc = RTMpOnSpecific(RTMpCpuIdFromSetIndex(iCpu), supdrvDetermineAsyncTscWorker, &CurTsc, NULL);
            if (RT_SUCCESS(rc))
            {
                if (CurTsc <= PrevTsc)
                {
                    /* TSC went backwards across CPUs -> async mode needed. */
                    fAsync = true;
                    offMin = offMax = PrevTsc - CurTsc;
                    Log(("supdrvDetermineAsyncTsc: iCpu=%d cLoops=%d CurTsc=%llx PrevTsc=%llx\n",
                         iCpu, cLoops, CurTsc, PrevTsc));
                    break;
                }

                /* Gather statistics (except the first time, i.e. iCpu==0 on
                   the first of the 8 passes where PrevTsc isn't cross-CPU). */
                if (iCpu != 0 || cLoops != 7)
                {
                    uint64_t off = CurTsc - PrevTsc;
                    if (off < offMin)
                        offMin = off;
                    if (off > offMax)
                        offMax = off;
                    Log2(("%d/%d: off=%llx\n", cLoops, iCpu, off));
                }

                /* Next */
                PrevTsc = CurTsc;
            }
            else if (rc == VERR_NOT_SUPPORTED)
                break;
            else
                AssertMsg(rc == VERR_CPU_NOT_FOUND || rc == VERR_CPU_OFFLINE, ("%d\n", rc));
        }

        /* broke out of the loop. */
        if (iCpu < iEndCpu)
            break;
    }

    if (poffMin)
        *poffMin = offMin; /* Almost RTMpOnSpecific profiling. */
    Log(("supdrvDetermineAsyncTsc: returns %d; iEndCpu=%d rc=%d offMin=%llx offMax=%llx\n",
         fAsync, iEndCpu, rc, offMin, offMax));
#if !defined(RT_OS_SOLARIS) && !defined(RT_OS_OS2) && !defined(RT_OS_WINDOWS)
    OSDBGPRINT(("vboxdrv: fAsync=%d offMin=%#lx offMax=%#lx\n", fAsync, (long)offMin, (long)offMax));
#endif
    return fAsync;
}
2823
2824
/**
 * supdrvGipInit() worker that determines the GIP TSC mode.
 *
 * Decision order: single CPU -> SYNC/INVARIANT; forced async (user/OS) ->
 * ASYNC; measured cross-CPU drift -> ASYNC; invariant TSC -> INVARIANT;
 * AMD with power management features -> ASYNC; otherwise SYNC.
 *
 * @returns The most suitable TSC mode.
 * @param   pDevExt     Pointer to the device instance data.
 */
static SUPGIPMODE supdrvGipInitDetermineTscMode(PSUPDRVDEVEXT pDevExt)
{
    uint64_t u64DiffCoresIgnored;
    uint32_t uEAX, uEBX, uECX, uEDX;

    /*
     * Establish whether the CPU advertises TSC as invariant, we need that in
     * a couple of places below.
     */
    bool fInvariantTsc = false;
    if (ASMHasCpuId())
    {
        uEAX = ASMCpuId_EAX(0x80000000);
        if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
        {
            /* CPUID leaf 0x80000007, EDX bit: invariant TSC. */
            uEDX = ASMCpuId_EDX(0x80000007);
            if (uEDX & X86_CPUID_AMD_ADVPOWER_EDX_TSCINVAR)
                fInvariantTsc = true;
        }
    }

    /*
     * On single CPU systems, we don't need to consider ASYNC mode.
     */
    if (RTMpGetCount() <= 1)
        return fInvariantTsc ? SUPGIPMODE_INVARIANT_TSC : SUPGIPMODE_SYNC_TSC;

    /*
     * Allow the user and/or OS specific bits to force async mode.
     */
    if (supdrvOSGetForcedAsyncTscMode(pDevExt))
        return SUPGIPMODE_ASYNC_TSC;


#if 0 /** @todo enable this when i64TscDelta is applied in all places where it's needed */
    /*
     * Use invariant mode if the CPU says TSC is invariant.
     */
    if (fInvariantTsc)
        return SUPGIPMODE_INVARIANT_TSC;
#endif

    /*
     * TSC is not invariant and we're on SMP, this presents two problems:
     *
     *      (1) There might be a skew between the CPU, so that cpu0
     *          returns a TSC that is slightly different from cpu1.
     *          This skew may be due to (2), bad TSC initialization
     *          or slightly different TSC rates.
     *
     *      (2) Power management (and other things) may cause the TSC
     *          to run at a non-constant speed, and cause the speed
     *          to be different on the cpus. This will result in (1).
     *
     * If any of the above is detected, we will have to use ASYNC mode.
     */

    /* (1). Try check for current differences between the cpus. */
    if (supdrvDetermineAsyncTsc(&u64DiffCoresIgnored))
        return SUPGIPMODE_ASYNC_TSC;

#if 1 /** @todo remove once i64TscDelta is applied everywhere. Enable #if 0 above. */
    if (fInvariantTsc)
        return SUPGIPMODE_INVARIANT_TSC;
#endif

    /* (2) If it's an AMD CPU with power management, we won't trust its TSC. */
    ASMCpuId(0, &uEAX, &uEBX, &uECX, &uEDX);
    if (   ASMIsValidStdRange(uEAX)
        && ASMIsAmdCpuEx(uEBX, uECX, uEDX))
    {
        /* Check for APM support. */
        uEAX = ASMCpuId_EAX(0x80000000);
        if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
        {
            uEDX = ASMCpuId_EDX(0x80000007);
            if (uEDX & 0x3e)  /* STC|TM|THERMTRIP|VID|FID. Ignore TS. */
                return SUPGIPMODE_ASYNC_TSC;
        }
    }

    return SUPGIPMODE_SYNC_TSC;
}
2914
2915
2916/**
2917 * Initializes per-CPU GIP information.
2918 *
2919 * @param pDevExt Pointer to the device instance data.
2920 * @param pGip Pointer to the GIP.
2921 * @param pCpu Pointer to which GIP CPU to initalize.
2922 * @param u64NanoTS The current nanosecond timestamp.
2923 */
2924static void supdrvGipInitCpu(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS)
2925{
2926 /* !!! Warning !!! The GIP may not be linked to the device instance data at this point!
2927 which is why we have 2 separate parameters. Don't dereference pDevExt->pGip here. */
2928 pCpu->u32TransactionId = 2;
2929 pCpu->u64NanoTS = u64NanoTS;
2930 pCpu->u64TSC = ASMReadTSC();
2931 pCpu->u64TSCSample = GIP_TSC_DELTA_RSVD;
2932 pCpu->i64TSCDelta = pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED ? INT64_MAX : 0;
2933
2934 ASMAtomicWriteSize(&pCpu->enmState, SUPGIPCPUSTATE_INVALID);
2935 ASMAtomicWriteSize(&pCpu->idCpu, NIL_RTCPUID);
2936 ASMAtomicWriteS16(&pCpu->iCpuSet, -1);
2937 ASMAtomicWriteU16(&pCpu->idApic, UINT16_MAX);
2938
2939 /*
2940 * We don't know the following values until we've executed updates.
2941 * So, we'll just pretend it's a 4 GHz CPU and adjust the history it on
2942 * the 2nd timer callout.
2943 */
2944 pCpu->u64CpuHz = _4G + 1; /* tstGIP-2 depends on this. */
2945 pCpu->u32UpdateIntervalTSC
2946 = pCpu->au32TSCHistory[0]
2947 = pCpu->au32TSCHistory[1]
2948 = pCpu->au32TSCHistory[2]
2949 = pCpu->au32TSCHistory[3]
2950 = pCpu->au32TSCHistory[4]
2951 = pCpu->au32TSCHistory[5]
2952 = pCpu->au32TSCHistory[6]
2953 = pCpu->au32TSCHistory[7]
2954 = (uint32_t)(_4G / pGip->u32UpdateHz);
2955}
2956
2957
2958/**
2959 * Initializes the GIP data.
2960 *
2961 * @param pDevExt Pointer to the device instance data.
2962 * @param pGip Pointer to the read-write kernel mapping of the GIP.
2963 * @param HCPhys The physical address of the GIP.
2964 * @param u64NanoTS The current nanosecond timestamp.
2965 * @param uUpdateHz The update frequency.
2966 * @param uUpdateIntervalNS The update interval in nanoseconds.
2967 * @param cCpus The CPU count.
2968 */
2969static void supdrvGipInit(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, RTHCPHYS HCPhys,
2970 uint64_t u64NanoTS, unsigned uUpdateHz, unsigned uUpdateIntervalNS, unsigned cCpus)
2971{
2972 size_t const cbGip = RT_ALIGN_Z(RT_OFFSETOF(SUPGLOBALINFOPAGE, aCPUs[cCpus]), PAGE_SIZE);
2973 unsigned i;
2974#ifdef DEBUG_DARWIN_GIP
2975 OSDBGPRINT(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
2976#else
2977 LogFlow(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
2978#endif
2979
2980 /*
2981 * Initialize the structure.
2982 */
2983 memset(pGip, 0, cbGip);
2984
2985 pGip->u32Magic = SUPGLOBALINFOPAGE_MAGIC;
2986 pGip->u32Version = SUPGLOBALINFOPAGE_VERSION;
2987 pGip->u32Mode = supdrvGipInitDetermineTscMode(pDevExt);
2988 if ( pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
2989 /*|| pGip->u32Mode == SUPGIPMODE_SYNC_TSC */)
2990 pGip->enmUseTscDelta = supdrvOSAreTscDeltasInSync() /* Allow OS override (windows). */
2991 ? SUPGIPUSETSCDELTA_ZERO_CLAIMED : SUPGIPUSETSCDELTA_PRACTICALLY_ZERO /* downgrade later */;
2992 else
2993 pGip->enmUseTscDelta = SUPGIPUSETSCDELTA_NOT_APPLICABLE;
2994 pGip->cCpus = (uint16_t)cCpus;
2995 pGip->cPages = (uint16_t)(cbGip / PAGE_SIZE);
2996 pGip->u32UpdateHz = uUpdateHz;
2997 pGip->u32UpdateIntervalNS = uUpdateIntervalNS;
2998 pGip->fGetGipCpu = SUPGIPGETCPU_APIC_ID;
2999 RTCpuSetEmpty(&pGip->OnlineCpuSet);
3000 RTCpuSetEmpty(&pGip->PresentCpuSet);
3001 RTMpGetSet(&pGip->PossibleCpuSet);
3002 pGip->cOnlineCpus = RTMpGetOnlineCount();
3003 pGip->cPresentCpus = RTMpGetPresentCount();
3004 pGip->cPossibleCpus = RTMpGetCount();
3005 pGip->idCpuMax = RTMpGetMaxCpuId();
3006 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromApicId); i++)
3007 pGip->aiCpuFromApicId[i] = UINT16_MAX;
3008 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx); i++)
3009 pGip->aiCpuFromCpuSetIdx[i] = UINT16_MAX;
3010 for (i = 0; i < cCpus; i++)
3011 supdrvGipInitCpu(pDevExt, pGip, &pGip->aCPUs[i], u64NanoTS);
3012
3013 /*
3014 * Link it to the device extension.
3015 */
3016 pDevExt->pGip = pGip;
3017 pDevExt->HCPhysGip = HCPhys;
3018 pDevExt->cGipUsers = 0;
3019}
3020
3021
3022/**
3023 * On CPU initialization callback for RTMpOnAll.
3024 *
3025 * @param idCpu The CPU ID.
3026 * @param pvUser1 The device extension.
3027 * @param pvUser2 The GIP.
3028 */
3029static DECLCALLBACK(void) supdrvGipInitOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
3030{
3031 /* This is good enough, even though it will update some of the globals a
3032 bit to much. */
3033 supdrvGipMpEventOnline((PSUPDRVDEVEXT)pvUser1, idCpu);
3034}
3035
3036
3037/**
3038 * Invalidates the GIP data upon termination.
3039 *
3040 * @param pGip Pointer to the read-write kernel mapping of the GIP.
3041 */
3042static void supdrvGipTerm(PSUPGLOBALINFOPAGE pGip)
3043{
3044 unsigned i;
3045 pGip->u32Magic = 0;
3046 for (i = 0; i < pGip->cCpus; i++)
3047 {
3048 pGip->aCPUs[i].u64NanoTS = 0;
3049 pGip->aCPUs[i].u64TSC = 0;
3050 pGip->aCPUs[i].iTSCHistoryHead = 0;
3051 pGip->aCPUs[i].u64TSCSample = 0;
3052 pGip->aCPUs[i].i64TSCDelta = INT64_MAX;
3053 }
3054}
3055
3056
3057/**
3058 * Worker routine for supdrvGipUpdate() and supdrvGipUpdatePerCpu() that
3059 * updates all the per cpu data except the transaction id.
3060 *
3061 * @param pDevExt The device extension.
3062 * @param pGipCpu Pointer to the per cpu data.
3063 * @param u64NanoTS The current time stamp.
3064 * @param u64TSC The current TSC.
3065 * @param iTick The current timer tick.
3066 *
3067 * @remarks Can be called with interrupts disabled!
3068 */
3069static void supdrvGipDoUpdateCpu(PSUPDRVDEVEXT pDevExt, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS, uint64_t u64TSC, uint64_t iTick)
3070{
3071 uint64_t u64TSCDelta;
3072 uint32_t u32UpdateIntervalTSC;
3073 uint32_t u32UpdateIntervalTSCSlack;
3074 unsigned iTSCHistoryHead;
3075 uint64_t u64CpuHz;
3076 uint32_t u32TransactionId;
3077
3078 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3079 AssertPtrReturnVoid(pGip);
3080
3081 /* Delta between this and the previous update. */
3082 ASMAtomicUoWriteU32(&pGipCpu->u32PrevUpdateIntervalNS, (uint32_t)(u64NanoTS - pGipCpu->u64NanoTS));
3083
3084 /*
3085 * Update the NanoTS.
3086 */
3087 ASMAtomicWriteU64(&pGipCpu->u64NanoTS, u64NanoTS);
3088
3089 /*
3090 * Calc TSC delta.
3091 */
3092 u64TSCDelta = u64TSC - pGipCpu->u64TSC;
3093 ASMAtomicWriteU64(&pGipCpu->u64TSC, u64TSC);
3094
3095 /* We don't need to keep realculating the frequency when it's invariant. */
3096 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
3097 return;
3098
3099 if (u64TSCDelta >> 32)
3100 {
3101 u64TSCDelta = pGipCpu->u32UpdateIntervalTSC;
3102 pGipCpu->cErrors++;
3103 }
3104
3105 /*
3106 * On the 2nd and 3rd callout, reset the history with the current TSC
3107 * interval since the values entered by supdrvGipInit are totally off.
3108 * The interval on the 1st callout completely unreliable, the 2nd is a bit
3109 * better, while the 3rd should be most reliable.
3110 */
3111 u32TransactionId = pGipCpu->u32TransactionId;
3112 if (RT_UNLIKELY( ( u32TransactionId == 5
3113 || u32TransactionId == 7)
3114 && ( iTick == 2
3115 || iTick == 3) ))
3116 {
3117 unsigned i;
3118 for (i = 0; i < RT_ELEMENTS(pGipCpu->au32TSCHistory); i++)
3119 ASMAtomicUoWriteU32(&pGipCpu->au32TSCHistory[i], (uint32_t)u64TSCDelta);
3120 }
3121
3122 /*
3123 * Validate the NanoTS deltas between timer fires with an arbitrary threshold of 0.5%.
3124 * Wait until we have at least one full history since the above history reset. The
3125 * assumption is that the majority of the previous history values will be tolerable.
3126 * See @bugref{6710} comment #67.
3127 */
3128 if ( u32TransactionId > 23 /* 7 + (8 * 2) */
3129 && pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
3130 {
3131 uint32_t uNanoTsThreshold = pGip->u32UpdateIntervalNS / 200;
3132 if ( pGipCpu->u32PrevUpdateIntervalNS > pGip->u32UpdateIntervalNS + uNanoTsThreshold
3133 || pGipCpu->u32PrevUpdateIntervalNS < pGip->u32UpdateIntervalNS - uNanoTsThreshold)
3134 {
3135 uint32_t u32;
3136 u32 = pGipCpu->au32TSCHistory[0];
3137 u32 += pGipCpu->au32TSCHistory[1];
3138 u32 += pGipCpu->au32TSCHistory[2];
3139 u32 += pGipCpu->au32TSCHistory[3];
3140 u32 >>= 2;
3141 u64TSCDelta = pGipCpu->au32TSCHistory[4];
3142 u64TSCDelta += pGipCpu->au32TSCHistory[5];
3143 u64TSCDelta += pGipCpu->au32TSCHistory[6];
3144 u64TSCDelta += pGipCpu->au32TSCHistory[7];
3145 u64TSCDelta >>= 2;
3146 u64TSCDelta += u32;
3147 u64TSCDelta >>= 1;
3148 }
3149 }
3150
3151 /*
3152 * TSC History.
3153 */
3154 Assert(RT_ELEMENTS(pGipCpu->au32TSCHistory) == 8);
3155 iTSCHistoryHead = (pGipCpu->iTSCHistoryHead + 1) & 7;
3156 ASMAtomicWriteU32(&pGipCpu->iTSCHistoryHead, iTSCHistoryHead);
3157 ASMAtomicWriteU32(&pGipCpu->au32TSCHistory[iTSCHistoryHead], (uint32_t)u64TSCDelta);
3158
3159 /*
3160 * UpdateIntervalTSC = average of last 8,2,1 intervals depending on update HZ.
3161 *
3162 * On Windows, we have an occasional (but recurring) sour value that messed up
3163 * the history but taking only 1 interval reduces the precision overall.
3164 * However, this problem existed before the invariant mode was introduced.
3165 */
3166 if ( pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
3167 || pGip->u32UpdateHz >= 1000)
3168 {
3169 uint32_t u32;
3170 u32 = pGipCpu->au32TSCHistory[0];
3171 u32 += pGipCpu->au32TSCHistory[1];
3172 u32 += pGipCpu->au32TSCHistory[2];
3173 u32 += pGipCpu->au32TSCHistory[3];
3174 u32 >>= 2;
3175 u32UpdateIntervalTSC = pGipCpu->au32TSCHistory[4];
3176 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[5];
3177 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[6];
3178 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[7];
3179 u32UpdateIntervalTSC >>= 2;
3180 u32UpdateIntervalTSC += u32;
3181 u32UpdateIntervalTSC >>= 1;
3182
3183 /* Value chosen for a 2GHz Athlon64 running linux 2.6.10/11. */
3184 u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 14;
3185 }
3186 else if (pGip->u32UpdateHz >= 90)
3187 {
3188 u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;
3189 u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[(iTSCHistoryHead - 1) & 7];
3190 u32UpdateIntervalTSC >>= 1;
3191
3192 /* value chosen on a 2GHz thinkpad running windows */
3193 u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 7;
3194 }
3195 else
3196 {
3197 u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;
3198
3199 /* This value hasn't be checked yet.. waiting for OS/2 and 33Hz timers.. :-) */
3200 u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 6;
3201 }
3202 ASMAtomicWriteU32(&pGipCpu->u32UpdateIntervalTSC, u32UpdateIntervalTSC + u32UpdateIntervalTSCSlack);
3203
3204 /*
3205 * CpuHz.
3206 */
3207 u64CpuHz = ASMMult2xU32RetU64(u32UpdateIntervalTSC, RT_NS_1SEC);
3208 u64CpuHz /= pGip->u32UpdateIntervalNS;
3209 ASMAtomicWriteU64(&pGipCpu->u64CpuHz, u64CpuHz);
3210}
3211
3212
3213/**
3214 * Updates the GIP.
3215 *
3216 * @param pDevExt The device extension.
3217 * @param u64NanoTS The current nanosecond timesamp.
3218 * @param u64TSC The current TSC timesamp.
3219 * @param idCpu The CPU ID.
3220 * @param iTick The current timer tick.
3221 *
3222 * @remarks Can be called with interrupts disabled!
3223 */
3224static void supdrvGipUpdate(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC, RTCPUID idCpu, uint64_t iTick)
3225{
3226 /*
3227 * Determine the relevant CPU data.
3228 */
3229 PSUPGIPCPU pGipCpu;
3230 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3231 AssertPtrReturnVoid(pGip);
3232
3233 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
3234 pGipCpu = &pGip->aCPUs[0];
3235 else
3236 {
3237 unsigned iCpu = pGip->aiCpuFromApicId[ASMGetApicId()];
3238 if (RT_UNLIKELY(iCpu >= pGip->cCpus))
3239 return;
3240 pGipCpu = &pGip->aCPUs[iCpu];
3241 if (RT_UNLIKELY(pGipCpu->idCpu != idCpu))
3242 return;
3243 }
3244
3245 /*
3246 * Start update transaction.
3247 */
3248 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
3249 {
3250 /* this can happen on win32 if we're taking to long and there are more CPUs around. shouldn't happen though. */
3251 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
3252 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
3253 pGipCpu->cErrors++;
3254 return;
3255 }
3256
3257 /*
3258 * Recalc the update frequency every 0x800th time.
3259 */
3260 if ( pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC /* cuz we're not recalculating the frequency on invariants hosts. */
3261 && !(pGipCpu->u32TransactionId & (GIP_UPDATEHZ_RECALC_FREQ * 2 - 2)))
3262 {
3263 if (pGip->u64NanoTSLastUpdateHz)
3264 {
3265#ifdef RT_ARCH_AMD64 /** @todo fix 64-bit div here to work on x86 linux. */
3266 uint64_t u64Delta = u64NanoTS - pGip->u64NanoTSLastUpdateHz;
3267 uint32_t u32UpdateHz = (uint32_t)((RT_NS_1SEC_64 * GIP_UPDATEHZ_RECALC_FREQ) / u64Delta);
3268 if (u32UpdateHz <= 2000 && u32UpdateHz >= 30)
3269 {
3270 /** @todo r=ramshankar: Changing u32UpdateHz might screw up TSC frequency
3271 * calculation on non-invariant hosts if it changes the history decision
3272 * taken in supdrvGipDoUpdateCpu(). */
3273 uint64_t u64Interval = u64Delta / GIP_UPDATEHZ_RECALC_FREQ;
3274 ASMAtomicWriteU32(&pGip->u32UpdateHz, u32UpdateHz);
3275 ASMAtomicWriteU32(&pGip->u32UpdateIntervalNS, (uint32_t)u64Interval);
3276 }
3277#endif
3278 }
3279 ASMAtomicWriteU64(&pGip->u64NanoTSLastUpdateHz, u64NanoTS | 1);
3280 }
3281
3282 /*
3283 * Update the data.
3284 */
3285 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
3286
3287 /*
3288 * Complete transaction.
3289 */
3290 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
3291}
3292
3293
3294/**
3295 * Updates the per cpu GIP data for the calling cpu.
3296 *
3297 * @param pDevExt The device extension.
3298 * @param u64NanoTS The current nanosecond timesamp.
3299 * @param u64TSC The current TSC timesamp.
3300 * @param idCpu The CPU ID.
3301 * @param idApic The APIC id for the CPU index.
3302 * @param iTick The current timer tick.
3303 *
3304 * @remarks Can be called with interrupts disabled!
3305 */
3306static void supdrvGipUpdatePerCpu(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC,
3307 RTCPUID idCpu, uint8_t idApic, uint64_t iTick)
3308{
3309 uint32_t iCpu;
3310 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3311
3312 /*
3313 * Avoid a potential race when a CPU online notification doesn't fire on
3314 * the onlined CPU but the tick creeps in before the event notification is
3315 * run.
3316 */
3317 if (RT_UNLIKELY(iTick == 1))
3318 {
3319 iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
3320 if (pGip->aCPUs[iCpu].enmState == SUPGIPCPUSTATE_OFFLINE)
3321 supdrvGipMpEventOnline(pDevExt, idCpu);
3322 }
3323
3324 iCpu = pGip->aiCpuFromApicId[idApic];
3325 if (RT_LIKELY(iCpu < pGip->cCpus))
3326 {
3327 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
3328 if (pGipCpu->idCpu == idCpu)
3329 {
3330 /*
3331 * Start update transaction.
3332 */
3333 if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
3334 {
3335 AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
3336 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
3337 pGipCpu->cErrors++;
3338 return;
3339 }
3340
3341 /*
3342 * Update the data.
3343 */
3344 supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);
3345
3346 /*
3347 * Complete transaction.
3348 */
3349 ASMAtomicIncU32(&pGipCpu->u32TransactionId);
3350 }
3351 }
3352}
3353
3354
3355/**
3356 * Service a TSC-delta measurement request.
3357 *
3358 * @returns VBox status code.
3359 * @param pDevExt Pointer to the device instance data.
3360 * @param pSession The support driver session.
3361 * @param pReq Pointer to the TSC-delta measurement request.
3362 */
3363int VBOXCALL supdrvIOCtl_TscDeltaMeasure(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCDELTAMEASURE pReq)
3364{
3365 PSUPGLOBALINFOPAGE pGip;
3366 RTCPUID idCpuWorker;
3367 int rc;
3368 int16_t cTries;
3369 RTMSINTERVAL cMsWaitRetry;
3370 uint16_t iCpu;
3371
3372 /*
3373 * Validate.
3374 */
3375 AssertPtr(pDevExt); AssertPtr(pSession); AssertPtr(pReq); /* paranoia^2 */
3376 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
3377 return VERR_WRONG_ORDER;
3378 pGip = pDevExt->pGip;
3379 AssertReturn(pGip, VERR_INTERNAL_ERROR_2);
3380
3381 idCpuWorker = pReq->u.In.idCpu;
3382 if (idCpuWorker == NIL_RTCPUID)
3383 return VERR_INVALID_CPU_ID;
3384 cTries = RT_MAX(pReq->u.In.cRetries + 1, 10);
3385 cMsWaitRetry = RT_MAX(pReq->u.In.cMsWaitRetry, 5);
3386
3387 /*
3388 * The request is a noop if the TSC delta isn't being used.
3389 */
3390 pGip = pDevExt->pGip;
3391 if (pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED)
3392 return VINF_SUCCESS;
3393
3394 rc = VERR_CPU_NOT_FOUND;
3395 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
3396 {
3397 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
3398 if (pGipCpuWorker->idCpu == idCpuWorker)
3399 {
3400 if ( pGipCpuWorker->i64TSCDelta != INT64_MAX
3401 && !pReq->u.In.fForce)
3402 return VINF_SUCCESS;
3403
3404#ifdef SUPDRV_USE_TSC_DELTA_THREAD
3405 if (pReq->u.In.fAsync)
3406 {
3407 /** @todo Async. doesn't implement options like retries, waiting. We'll need
3408 * to pass those options to the thread somehow and implement it in the
3409 * thread. Check if anyone uses/needs fAsync before implementing this. */
3410 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3411 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3412 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
3413 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
3414 {
3415 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
3416 }
3417 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3418 RTThreadUserSignal(pDevExt->hTscDeltaThread);
3419 return VINF_SUCCESS;
3420 }
3421
3422 /*
3423 * If a TSC-delta measurement request is already being serviced by the thread,
3424 * wait 'cTries' times if a retry-timeout is provided, otherwise bail as busy.
3425 */
3426 while (cTries-- > 0)
3427 {
3428 SUPDRVTSCDELTATHREADSTATE enmState;
3429 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3430 enmState = pDevExt->enmTscDeltaThreadState;
3431 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3432
3433 if ( enmState == kTscDeltaThreadState_Measuring
3434 || enmState == kTscDeltaThreadState_WaitAndMeasure)
3435 {
3436 if ( !cTries
3437 || !cMsWaitRetry)
3438 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
3439 if (cMsWaitRetry)
3440 RTThreadSleep(cMsWaitRetry);
3441 }
3442 }
3443 cTries = RT_MAX(pReq->u.In.cRetries + 1, 10);
3444#endif
3445
3446 while (cTries-- > 0)
3447 {
3448 rc = supdrvMeasureTscDeltaOne(pDevExt, iCpu);
3449 if (RT_SUCCESS(rc))
3450 {
3451 Assert(pGipCpuWorker->i64TSCDelta != INT64_MAX);
3452 break;
3453 }
3454
3455 if (cMsWaitRetry)
3456 RTThreadSleep(cMsWaitRetry);
3457 }
3458
3459 break;
3460 }
3461 }
3462 return rc;
3463}
3464
3465
3466/**
3467 * Reads TSC with delta applied.
3468 *
3469 * Will try to resolve delta value INT64_MAX before applying it. This is the
3470 * main purpose of this function, to handle the case where the delta needs to be
3471 * determined.
3472 *
3473 * @returns VBox status code.
3474 * @param pDevExt Pointer to the device instance data.
3475 * @param pSession The support driver session.
3476 * @param pReq Pointer to the TSC-read request.
3477 */
3478int VBOXCALL supdrvIOCtl_TscRead(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCREAD pReq)
3479{
3480 PSUPGLOBALINFOPAGE pGip;
3481 int rc;
3482
3483 /*
3484 * Validate. We require the client to have mapped GIP (no asserting on
3485 * ring-3 preconditions).
3486 */
3487 AssertPtr(pDevExt); AssertPtr(pReq); AssertPtr(pSession); /* paranoia^2 */
3488 if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
3489 return VERR_WRONG_ORDER;
3490 pGip = pDevExt->pGip;
3491 AssertReturn(pGip, VERR_INTERNAL_ERROR_2);
3492
3493 /*
3494 * We're usually here because we need to apply delta, but we shouldn't be
3495 * upset if the GIP is some different mode.
3496 */
3497 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
3498 {
3499 uint32_t cTries = 0;
3500 for (;;)
3501 {
3502 /*
3503 * Start by gathering the data, using CLI for disabling preemption
3504 * while we do that.
3505 */
3506 RTCCUINTREG uFlags = ASMIntDisableFlags();
3507 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
3508 int iGipCpu;
3509 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
3510 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
3511 {
3512 int64_t i64Delta = pGip->aCPUs[iGipCpu].i64TSCDelta;
3513 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
3514 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
3515 ASMSetFlags(uFlags);
3516
3517 /*
3518 * If we're lucky we've got a delta, but no predicitions here
3519 * as this I/O control is normally only used when the TSC delta
3520 * is set to INT64_MAX.
3521 */
3522 if (i64Delta != INT64_MAX)
3523 {
3524 pReq->u.Out.u64AdjustedTsc -= i64Delta;
3525 rc = VINF_SUCCESS;
3526 break;
3527 }
3528
3529 /* Give up after a few times. */
3530 if (cTries >= 4)
3531 {
3532 rc = VWRN_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
3533 break;
3534 }
3535
3536 /* Need to measure the delta an try again. */
3537 rc = supdrvMeasureTscDeltaOne(pDevExt, iGipCpu);
3538 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
3539 }
3540 else
3541 {
3542 /* This really shouldn't happen. */
3543 AssertMsgFailed(("idCpu=%#x iCpuSet=%#x (%d)\n", RTMpCpuId(), iCpuSet, iCpuSet));
3544 pReq->u.Out.idApic = ASMGetApicId();
3545 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
3546 ASMSetFlags(uFlags);
3547 rc = VERR_INTERNAL_ERROR_5; /** @todo change to warning. */
3548 break;
3549 }
3550 }
3551 }
3552 else
3553 {
3554 /*
3555 * No delta to apply. Easy. Deal with preemption the lazy way.
3556 */
3557 RTCCUINTREG uFlags = ASMIntDisableFlags();
3558 int iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
3559 int iGipCpu;
3560 if (RT_LIKELY( (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
3561 && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
3562 pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
3563 else
3564 pReq->u.Out.idApic = ASMGetApicId();
3565 pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
3566 ASMSetFlags(uFlags);
3567 rc = VINF_SUCCESS;
3568 }
3569
3570 return rc;
3571}
3572
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette