VirtualBox

source: vbox/trunk/src/VBox/HostDrivers/Support/SUPDrvGip.cpp@ 54354

Last change on this file since 54354 was 54354, checked in by vboxsync, 10 years ago

nit

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 149.1 KB
Line 
1/* $Id: SUPDrvGip.cpp 54354 2015-02-22 01:46:35Z vboxsync $ */
2/** @file
3 * VBoxDrv - The VirtualBox Support Driver - Common code for GIP.
4 */
5
6/*
7 * Copyright (C) 2006-2015 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.215389.xyz. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27/*******************************************************************************
28* Header Files *
29*******************************************************************************/
30#define LOG_GROUP LOG_GROUP_SUP_DRV
31#define SUPDRV_AGNOSTIC
32#include "SUPDrvInternal.h"
33#ifndef PAGE_SHIFT
34# include <iprt/param.h>
35#endif
36#include <iprt/asm.h>
37#include <iprt/asm-amd64-x86.h>
38#include <iprt/asm-math.h>
39#include <iprt/cpuset.h>
40#include <iprt/handletable.h>
41#include <iprt/mem.h>
42#include <iprt/mp.h>
43#include <iprt/power.h>
44#include <iprt/process.h>
45#include <iprt/semaphore.h>
46#include <iprt/spinlock.h>
47#include <iprt/thread.h>
48#include <iprt/uuid.h>
49#include <iprt/net.h>
50#include <iprt/crc.h>
51#include <iprt/string.h>
52#include <iprt/timer.h>
53#if defined(RT_OS_DARWIN) || defined(RT_OS_SOLARIS) || defined(RT_OS_FREEBSD)
54# include <iprt/rand.h>
55# include <iprt/path.h>
56#endif
57#include <iprt/uint128.h>
58#include <iprt/x86.h>
59
60#include <VBox/param.h>
61#include <VBox/log.h>
62#include <VBox/err.h>
63
64#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
65# include "dtrace/SUPDrv.h"
66#else
67/* ... */
68#endif
69
70
71/*******************************************************************************
72* Defined Constants And Macros *
73*******************************************************************************/
74/** The frequency by which we recalculate the u32UpdateHz and
75 * u32UpdateIntervalNS GIP members. The value must be a power of 2.
76 *
77 * Warning: Bumping this too high might overflow u32UpdateIntervalNS.
78 */
79#define GIP_UPDATEHZ_RECALC_FREQ 0x800
80
81/** A reserved TSC value used for synchronization as well as measurement of
82 * TSC deltas. */
83#define GIP_TSC_DELTA_RSVD UINT64_MAX
84/** The number of TSC delta measurement loops in total (includes primer and
85 * read-time loops). */
86#define GIP_TSC_DELTA_LOOPS 96
87/** The number of cache primer loops. */
88#define GIP_TSC_DELTA_PRIMER_LOOPS 4
/** The number of loops until we keep computing the minimum read time. */
90#define GIP_TSC_DELTA_READ_TIME_LOOPS 24
91
92/** @name Master / worker synchronization values.
93 * @{ */
94/** Stop measurement of TSC delta. */
95#define GIP_TSC_DELTA_SYNC_STOP UINT32_C(0)
96/** Start measurement of TSC delta. */
97#define GIP_TSC_DELTA_SYNC_START UINT32_C(1)
98/** Worker thread is ready for reading the TSC. */
99#define GIP_TSC_DELTA_SYNC_WORKER_READY UINT32_C(2)
100/** Worker thread is done updating TSC delta info. */
101#define GIP_TSC_DELTA_SYNC_WORKER_DONE UINT32_C(3)
/** When IPRT isn't concurrent safe: Master is ready and will wait for worker
 * with a timeout. */
104#define GIP_TSC_DELTA_SYNC_PRESTART_MASTER UINT32_C(4)
105/** @} */
106
/** When IPRT isn't concurrent safe: Worker is ready after waiting for
 * master with a timeout. */
109#define GIP_TSC_DELTA_SYNC_PRESTART_WORKER 5
110/** The TSC-refinement interval in seconds. */
111#define GIP_TSC_REFINE_PREIOD_IN_SECS 5
112/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_PRACTICALLY_ZERO rating */
113#define GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO 32
114/** The TSC-delta threshold for the SUPGIPUSETSCDELTA_ROUGHLY_ZERO rating */
115#define GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO 448
116/** The TSC delta value for the initial GIP master - 0 in regular builds.
117 * To test the delta code this can be set to a non-zero value. */
118#if 0
119# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(170139095182512) /* 0x00009abd9854acb0 */
120#else
121# define GIP_TSC_DELTA_INITIAL_MASTER_VALUE INT64_C(0)
122#endif
123
124AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS < GIP_TSC_DELTA_READ_TIME_LOOPS);
125AssertCompile(GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS < GIP_TSC_DELTA_LOOPS);
126
127/** @def VBOX_SVN_REV
128 * The makefile should define this if it can. */
129#ifndef VBOX_SVN_REV
130# define VBOX_SVN_REV 0
131#endif
132
133#if 0 /* Don't start the GIP timers. Useful when debugging the IPRT timer code. */
134# define DO_NOT_START_GIP
135#endif
136
137
138/*******************************************************************************
139* Internal Functions *
140*******************************************************************************/
141static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
142static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick);
143static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz);
144#ifdef SUPDRV_USE_TSC_DELTA_THREAD
145static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt);
146static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt);
147static int supdrvTscDeltaThreadWaitForOnlineCpus(PSUPDRVDEVEXT pDevExt);
148#endif
149
150
151/*******************************************************************************
152* Global Variables *
153*******************************************************************************/
154DECLEXPORT(PSUPGLOBALINFOPAGE) g_pSUPGlobalInfoPage = NULL;
155
156
157
158/*
159 *
160 * Misc Common GIP Code
161 * Misc Common GIP Code
162 * Misc Common GIP Code
163 *
164 *
165 */
166
167
168/**
169 * Finds the GIP CPU index corresponding to @a idCpu.
170 *
171 * @returns GIP CPU array index, UINT32_MAX if not found.
172 * @param pGip The GIP.
173 * @param idCpu The CPU ID.
174 */
175static uint32_t supdrvGipFindCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
176{
177 uint32_t i;
178 for (i = 0; i < pGip->cCpus; i++)
179 if (pGip->aCPUs[i].idCpu == idCpu)
180 return i;
181 return UINT32_MAX;
182}
183
184
/**
 * Applies the TSC delta to the supplied raw TSC value.
 *
 * @returns VBox status code. (Ignored by all users, just FYI.)
 * @param   pGip            Pointer to the GIP.
 * @param   puTsc           Pointer to a valid TSC value before the TSC delta has been applied.
 * @param   idApic          The APIC ID of the CPU @c puTsc corresponds to.
 * @param   pfDeltaApplied  Where to store whether the TSC delta was successfully
 *                          applied or not (optional, can be NULL).
 *
 * @remarks Maybe called with interrupts disabled in ring-0!
 *
 * @note    Don't you dare change the delta calculation. If you really do, make
 *          sure you update all places where it's used (IPRT, SUPLibAll.cpp,
 *          SUPDrv.c, supdrvGipMpEvent, and more).
 */
DECLINLINE(int) supdrvTscDeltaApply(PSUPGLOBALINFOPAGE pGip, uint64_t *puTsc, uint16_t idApic, bool *pfDeltaApplied)
{
    int rc;

    /*
     * Validate input.
     */
    AssertPtr(puTsc);
    AssertPtr(pGip);
    Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);

    /*
     * Carefully convert the idApic into a GIPCPU entry.
     * Both lookup steps are range-checked as the tables may be stale or
     * the APIC ID unexpected.
     */
    if (RT_LIKELY(idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)))
    {
        uint16_t iCpu = pGip->aiCpuFromApicId[idApic];
        if (RT_LIKELY(iCpu < pGip->cCpus))
        {
            PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];

            /*
             * Apply the delta if valid.  INT64_MAX is used elsewhere in this
             * file as the "delta not yet measured" marker, so skip it here.
             */
            if (RT_LIKELY(pGipCpu->i64TSCDelta != INT64_MAX))
            {
                *puTsc -= pGipCpu->i64TSCDelta;
                if (pfDeltaApplied)
                    *pfDeltaApplied = true;
                return VINF_SUCCESS;
            }

            /* Delta unavailable: still success, but *pfDeltaApplied ends up false below. */
            rc = VINF_SUCCESS;
        }
        else
        {
            AssertMsgFailed(("iCpu=%u cCpus=%u\n", iCpu, pGip->cCpus));
            rc = VERR_INVALID_CPU_INDEX;
        }
    }
    else
    {
        AssertMsgFailed(("idApic=%u\n", idApic));
        rc = VERR_INVALID_CPU_ID;
    }
    /* Common "not applied" exit for all failure/unavailable paths. */
    if (pfDeltaApplied)
        *pfDeltaApplied = false;
    return rc;
}
250
251
252/*
253 *
254 * GIP Mapping and Unmapping Related Code.
255 * GIP Mapping and Unmapping Related Code.
256 * GIP Mapping and Unmapping Related Code.
257 *
258 *
259 */
260
261
262/**
263 * (Re-)initializes the per-cpu structure prior to starting or resuming the GIP
264 * updating.
265 *
266 * @param pGip Pointer to the GIP.
267 * @param pGipCpu The per CPU structure for this CPU.
268 * @param u64NanoTS The current time.
269 */
270static void supdrvGipReInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS)
271{
272 /*
273 * Here we don't really care about applying the TSC delta. The re-initialization of this
274 * value is not relevant especially while (re)starting the GIP as the first few ones will
275 * be ignored anyway, see supdrvGipDoUpdateCpu().
276 */
277 pGipCpu->u64TSC = ASMReadTSC() - pGipCpu->u32UpdateIntervalTSC;
278 pGipCpu->u64NanoTS = u64NanoTS;
279}
280
281
282/**
283 * Set the current TSC and NanoTS value for the CPU.
284 *
285 * @param idCpu The CPU ID. Unused - we have to use the APIC ID.
286 * @param pvUser1 Pointer to the ring-0 GIP mapping.
287 * @param pvUser2 Pointer to the variable holding the current time.
288 */
289static DECLCALLBACK(void) supdrvGipReInitCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
290{
291 PSUPGLOBALINFOPAGE pGip = (PSUPGLOBALINFOPAGE)pvUser1;
292 unsigned iCpu = pGip->aiCpuFromApicId[ASMGetApicId()];
293
294 if (RT_LIKELY(iCpu < pGip->cCpus && pGip->aCPUs[iCpu].idCpu == idCpu))
295 supdrvGipReInitCpu(pGip, &pGip->aCPUs[iCpu], *(uint64_t *)pvUser2);
296
297 NOREF(pvUser2);
298 NOREF(idCpu);
299}
300
301
/**
 * State structure for supdrvGipDetectGetGipCpuCallback.
 *
 * All members are accessed concurrently by the callback running on every CPU
 * (via RTMpOnAll), hence the volatile qualifiers and the atomic accessors used
 * by the callback.
 */
typedef struct SUPDRVGIPDETECTGETCPU
{
    /** Bitmap of APIC IDs that has been seen (initialized to zero).
     * Used to detect duplicate APIC IDs (paranoia). */
    uint8_t volatile bmApicId[256 / 8];
    /** Mask of supported GIP CPU getter methods (SUPGIPGETCPU_XXX) (all bits set
     * initially). The callback clears the methods not detected. */
    uint32_t volatile fSupported;
    /** The first callback detecting any kind of range issues (initialized to
     * NIL_RTCPUID).  Set at most once via compare-exchange. */
    RTCPUID volatile idCpuProblem;
} SUPDRVGIPDETECTGETCPU;
/** Pointer to state structure for supdrvGipDetectGetGipCpuCallback. */
typedef SUPDRVGIPDETECTGETCPU *PSUPDRVGIPDETECTGETCPU;
319
320
/**
 * Checks for alternative ways of getting the CPU ID.
 *
 * This also checks the APIC ID, CPU ID and CPU set index values against the
 * GIP tables.
 *
 * @param   idCpu       The CPU ID. Unused - we have to use the APIC ID.
 * @param   pvUser1     Pointer to the state structure.
 * @param   pvUser2     Pointer to the GIP.
 */
static DECLCALLBACK(void) supdrvGipDetectGetGipCpuCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
{
    PSUPDRVGIPDETECTGETCPU  pState = (PSUPDRVGIPDETECTGETCPU)pvUser1;
    PSUPGLOBALINFOPAGE      pGip   = (PSUPGLOBALINFOPAGE)pvUser2;
    uint32_t                fSupported = 0;   /* Methods detected on THIS CPU; ANDed into the shared mask at the end. */
    uint16_t                idApic;
    int                     iCpuSet;

    AssertMsg(idCpu == RTMpCpuId(), ("idCpu=%#x RTMpCpuId()=%#x\n", idCpu, RTMpCpuId())); /* paranoia^3 */

    /*
     * Check that the CPU ID and CPU set index are interchangable.
     * The IDTR/RDTSCP tricks below only work when they are, since both
     * encode a set-index-sized value.
     */
    iCpuSet = RTMpCpuIdToSetIndex(idCpu);
    if ((RTCPUID)iCpuSet == idCpu)
    {
        AssertCompile(RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS));
        if (   iCpuSet >= 0
            && iCpuSet < RTCPUSET_MAX_CPUS
            && RT_IS_POWER_OF_TWO(RTCPUSET_MAX_CPUS))
        {
            /*
             * Check whether the IDTR.LIMIT contains a CPU number.
             * The limit in excess of the expected descriptor table size is
             * masked with (RTCPUSET_MAX_CPUS - 1) and compared to idCpu.
             *
             * NOTE(review): the two arch branches look swapped - 32-bit IDT
             * gates are 8 bytes (X86DESCGATE) while 64-bit system descriptors
             * are 16 bytes (X86DESC64SYSTEM).  Verify against iprt/x86.h.
             */
#ifdef RT_ARCH_X86
            uint16_t const cbIdt = sizeof(X86DESC64SYSTEM) * 256;
#else
            uint16_t const cbIdt = sizeof(X86DESCGATE) * 256;
#endif
            RTIDTR Idtr;
            ASMGetIDTR(&Idtr);
            if (Idtr.cbIdt >= cbIdt)
            {
                uint32_t uTmp = Idtr.cbIdt - cbIdt;
                uTmp &= RTCPUSET_MAX_CPUS - 1;
                if (uTmp == idCpu)
                {
                    /* Read the IDTR a second time and require an unchanged
                       limit before trusting the value. */
                    RTIDTR Idtr2;
                    ASMGetIDTR(&Idtr2);
                    if (Idtr2.cbIdt == Idtr.cbIdt)
                        fSupported |= SUPGIPGETCPU_IDTR_LIMIT_MASK_MAX_SET_CPUS;
                }
            }

            /*
             * Check whether RDTSCP is an option.
             * Requires the extended CPUID range and the RDTSCP feature bit,
             * and that TSC_AUX (masked) yields this CPU's ID twice in a row.
             */
            if (ASMHasCpuId())
            {
                if (   ASMIsValidExtRange(ASMCpuId_EAX(UINT32_C(0x80000000)))
                    && (ASMCpuId_EDX(UINT32_C(0x80000001)) & X86_CPUID_EXT_FEATURE_EDX_RDTSCP) )
                {
                    uint32_t uAux;
                    ASMReadTscWithAux(&uAux);
                    if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
                    {
                        /* Double-check after a pause to reduce the chance of a fluke. */
                        ASMNopPause();
                        ASMReadTscWithAux(&uAux);
                        if ((uAux & (RTCPUSET_MAX_CPUS - 1)) == idCpu)
                            fSupported |= SUPGIPGETCPU_RDTSCP_MASK_MAX_SET_CPUS;
                    }
                }
            }
        }
    }

    /*
     * Check that the APIC ID is unique.
     * The shared bitmap detects two CPUs reporting the same APIC ID.
     */
    idApic = ASMGetApicId();
    if (RT_LIKELY(   idApic < RT_ELEMENTS(pGip->aiCpuFromApicId)
                  && !ASMAtomicBitTestAndSet(pState->bmApicId, idApic)))
        fSupported |= SUPGIPGETCPU_APIC_ID;
    else
    {
        AssertCompile(sizeof(pState->bmApicId) * 8 == RT_ELEMENTS(pGip->aiCpuFromApicId));
        /* Record the first problematic CPU only (compare-exchange from NIL). */
        ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
        LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - duplicate APIC ID.\n",
                idCpu, iCpuSet, idApic));
    }

    /*
     * Check that the iCpuSet is within the expected range.
     */
    if (RT_UNLIKELY(   iCpuSet < 0
                    || (unsigned)iCpuSet >= RTCPUSET_MAX_CPUS
                    || (unsigned)iCpuSet >= RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)))
    {
        ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
        LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU set index is out of range.\n",
                idCpu, iCpuSet, idApic));
    }
    else
    {
        /* Verify that the CPU-ID <-> set-index mapping round-trips. */
        RTCPUID idCpu2 = RTMpCpuIdFromSetIndex(iCpuSet);
        if (RT_UNLIKELY(idCpu2 != idCpu))
        {
            ASMAtomicCmpXchgU32(&pState->idCpuProblem, idCpu, NIL_RTCPUID);
            LogRel(("supdrvGipDetectGetGipCpuCallback: idCpu=%#x iCpuSet=%d idApic=%#x - CPU id/index roundtrip problem: %#x\n",
                    idCpu, iCpuSet, idApic, idCpu2));
        }
    }

    /*
     * Update the supported feature mask before we return.
     * ANDing means a method survives only if every CPU detected it.
     */
    ASMAtomicAndU32(&pState->fSupported, fSupported);

    NOREF(pvUser2);
}
441
442
443/**
444 * Increase the timer freqency on hosts where this is possible (NT).
445 *
446 * The idea is that more interrupts is better for us... Also, it's better than
447 * we increase the timer frequence, because we might end up getting inaccurate
448 * callbacks if someone else does it.
449 *
450 * @param pDevExt Sets u32SystemTimerGranularityGrant if increased.
451 */
452static void supdrvGipRequestHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
453{
454 if (pDevExt->u32SystemTimerGranularityGrant == 0)
455 {
456 uint32_t u32SystemResolution;
457 if ( RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 976563 /* 1024 HZ */, &u32SystemResolution))
458 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1000000 /* 1000 HZ */, &u32SystemResolution))
459 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 1953125 /* 512 HZ */, &u32SystemResolution))
460 || RT_SUCCESS_NP(RTTimerRequestSystemGranularity( 2000000 /* 500 HZ */, &u32SystemResolution))
461 )
462 {
463 Assert(RTTimerGetSystemGranularity() <= u32SystemResolution);
464 pDevExt->u32SystemTimerGranularityGrant = u32SystemResolution;
465 }
466 }
467}
468
469
470/**
471 * Undoes supdrvGipRequestHigherTimerFrequencyFromSystem.
472 *
473 * @param pDevExt Clears u32SystemTimerGranularityGrant.
474 */
475static void supdrvGipReleaseHigherTimerFrequencyFromSystem(PSUPDRVDEVEXT pDevExt)
476{
477 if (pDevExt->u32SystemTimerGranularityGrant)
478 {
479 int rc2 = RTTimerReleaseSystemGranularity(pDevExt->u32SystemTimerGranularityGrant);
480 AssertRC(rc2);
481 pDevExt->u32SystemTimerGranularityGrant = 0;
482 }
483}
484
485
/**
 * Maps the GIP into userspace and/or get the physical address of the GIP.
 *
 * @returns IPRT status code.
 * @param   pSession    Session to which the GIP mapping should belong.
 * @param   ppGipR3     Where to store the address of the ring-3 mapping. (optional)
 * @param   pHCPhysGip  Where to store the physical address. (optional)
 *
 * @remark  There is no reference counting on the mapping, so one call to this function
 *          count globally as one reference. One call to SUPR0GipUnmap() will unmap GIP
 *          and remove the session as a GIP user.
 */
SUPR0DECL(int) SUPR0GipMap(PSUPDRVSESSION pSession, PRTR3PTR ppGipR3, PRTHCPHYS pHCPhysGip)
{
    int rc;
    PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
    RTR3PTR pGipR3 = NIL_RTR3PTR;
    RTHCPHYS HCPhys = NIL_RTHCPHYS;
    LogFlow(("SUPR0GipMap: pSession=%p ppGipR3=%p pHCPhysGip=%p\n", pSession, ppGipR3, pHCPhysGip));

    /*
     * Validate
     */
    AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
    AssertPtrNullReturn(ppGipR3, VERR_INVALID_POINTER);
    AssertPtrNullReturn(pHCPhysGip, VERR_INVALID_POINTER);

    /* Everything below is serialized by the GIP mutex. */
#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
#else
    RTSemFastMutexRequest(pDevExt->mtxGip);
#endif
    if (pDevExt->pGip)
    {
        /*
         * Map it?  The ring-3 mapping is created lazily (read-only) and
         * reused on subsequent calls for the same session.
         */
        rc = VINF_SUCCESS;
        if (ppGipR3)
        {
            if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
                rc = RTR0MemObjMapUser(&pSession->GipMapObjR3, pDevExt->GipMemObj, (RTR3PTR)-1, 0,
                                       RTMEM_PROT_READ, RTR0ProcHandleSelf());
            if (RT_SUCCESS(rc))
                pGipR3 = RTR0MemObjAddressR3(pSession->GipMapObjR3);
        }

        /*
         * Get physical address.
         */
        if (pHCPhysGip && RT_SUCCESS(rc))
            HCPhys = pDevExt->HCPhysGip;

        /*
         * Reference globally.  The first user (re)starts GIP updating.
         */
        if (!pSession->fGipReferenced && RT_SUCCESS(rc))
        {
            pSession->fGipReferenced = 1;
            pDevExt->cGipUsers++;
            if (pDevExt->cGipUsers == 1)
            {
                PSUPGLOBALINFOPAGE pGipR0 = pDevExt->pGip;
                uint64_t u64NanoTS;

                /*
                 * GIP starts/resumes updating again. On windows we bump the
                 * host timer frequency to make sure we don't get stuck in guest
                 * mode and to get better timer (and possibly clock) accuracy.
                 */
                LogFlow(("SUPR0GipMap: Resumes GIP updating\n"));

                supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);

                /*
                 * When resuming (i.e. not the very first start), round each
                 * CPU's transaction id up to the next GIP_UPDATEHZ_RECALC_FREQ*2
                 * boundary and clear u64NanoTSLastUpdateHz - presumably to
                 * force a fresh update-interval recalculation on the first
                 * update after the pause (TODO confirm against
                 * supdrvGipDoUpdateCpu / the update Hz recalc code).
                 */
                if (pGipR0->aCPUs[0].u32TransactionId != 2 /* not the first time */)
                {
                    unsigned i;
                    for (i = 0; i < pGipR0->cCpus; i++)
                        ASMAtomicUoWriteU32(&pGipR0->aCPUs[i].u32TransactionId,
                                            (pGipR0->aCPUs[i].u32TransactionId + GIP_UPDATEHZ_RECALC_FREQ * 2)
                                            & ~(GIP_UPDATEHZ_RECALC_FREQ * 2 - 1));
                    ASMAtomicWriteU64(&pGipR0->u64NanoTSLastUpdateHz, 0);
                }

                /*
                 * Re-seed the per-CPU TSC/NanoTS base values, backdated by one
                 * update interval.  For invariant/sync TSC modes (or a single
                 * online CPU) seeding aCPUs[0] suffices; async mode needs every
                 * CPU re-seeded on its own CPU via RTMpOnAll.
                 */
                u64NanoTS = RTTimeSystemNanoTS() - pGipR0->u32UpdateIntervalNS;
                if (   pGipR0->u32Mode == SUPGIPMODE_INVARIANT_TSC
                    || pGipR0->u32Mode == SUPGIPMODE_SYNC_TSC
                    || RTMpGetOnlineCount() == 1)
                    supdrvGipReInitCpu(pGipR0, &pGipR0->aCPUs[0], u64NanoTS);
                else
                    RTMpOnAll(supdrvGipReInitCpuCallback, pGipR0, &u64NanoTS);

                /*
                 * Detect alternative ways to figure the CPU ID in ring-3 and
                 * raw-mode context.  Check the sanity of the APIC IDs, CPU IDs,
                 * and CPU set indexes while we're at it.
                 */
                if (RT_SUCCESS(rc))
                {
                    SUPDRVGIPDETECTGETCPU DetectState;
                    RT_BZERO((void *)&DetectState.bmApicId, sizeof(DetectState.bmApicId));
                    DetectState.fSupported = UINT32_MAX;
                    DetectState.idCpuProblem = NIL_RTCPUID;
                    rc = RTMpOnAll(supdrvGipDetectGetGipCpuCallback, &DetectState, pGipR0);
                    if (DetectState.idCpuProblem == NIL_RTCPUID)
                    {
                        if (   DetectState.fSupported != UINT32_MAX
                            && DetectState.fSupported != 0)
                        {
                            /* Publish the (possibly changed) getter mask. */
                            if (pGipR0->fGetGipCpu != DetectState.fSupported)
                            {
                                pGipR0->fGetGipCpu = DetectState.fSupported;
                                LogRel(("SUPR0GipMap: fGetGipCpu=%#x\n", DetectState.fSupported));
                            }
                        }
                        else
                        {
                            LogRel(("SUPR0GipMap: No supported ways of getting the APIC ID or CPU number in ring-3! (%#x)\n",
                                    DetectState.fSupported));
                            rc = VERR_UNSUPPORTED_CPU;
                        }
                    }
                    else
                    {
                        LogRel(("SUPR0GipMap: APIC ID, CPU ID or CPU set index problem detected on CPU #%u (%#x)!\n",
                                DetectState.idCpuProblem, DetectState.idCpuProblem));
                        rc = VERR_INVALID_CPU_ID;
                    }
                }

                /*
                 * Start the GIP timer if all is well..
                 */
                if (RT_SUCCESS(rc))
                {
#ifndef DO_NOT_START_GIP
                    rc = RTTimerStart(pDevExt->pGipTimer, 0 /* fire ASAP */); AssertRC(rc);
#endif
                    rc = VINF_SUCCESS;
                }

                /*
                 * Bail out on error: undo the reference, free any ring-3
                 * mapping and neutralize the return values.
                 */
                if (RT_FAILURE(rc))
                {
                    LogRel(("SUPR0GipMap: failed rc=%Rrc\n", rc));
                    pDevExt->cGipUsers = 0;
                    pSession->fGipReferenced = 0;
                    if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
                    {
                        int rc2 = RTR0MemObjFree(pSession->GipMapObjR3, false); AssertRC(rc2);
                        if (RT_SUCCESS(rc2))
                            pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
                    }
                    HCPhys = NIL_RTHCPHYS;
                    pGipR3 = NIL_RTR3PTR;
                }
            }
        }
    }
    else
    {
        rc = VERR_GENERAL_FAILURE;
        Log(("SUPR0GipMap: GIP is not available!\n"));
    }
#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRelease(pDevExt->mtxGip);
#else
    RTSemFastMutexRelease(pDevExt->mtxGip);
#endif

    /*
     * Write returns.
     */
    if (pHCPhysGip)
        *pHCPhysGip = HCPhys;
    if (ppGipR3)
        *ppGipR3 = pGipR3;

#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
#else
    LogFlow(( "SUPR0GipMap: returns %d *pHCPhysGip=%lx pGipR3=%p\n", rc, (unsigned long)HCPhys, (void *)pGipR3));
#endif
    return rc;
}
679
680
/**
 * Unmaps any user mapping of the GIP and terminates all GIP access
 * from this session.
 *
 * @returns IPRT status code.
 * @param   pSession    Session to which the GIP mapping should belong.
 */
SUPR0DECL(int) SUPR0GipUnmap(PSUPDRVSESSION pSession)
{
    int rc = VINF_SUCCESS;
    PSUPDRVDEVEXT pDevExt = pSession->pDevExt;
#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("SUPR0GipUnmap: pSession=%p pGip=%p GipMapObjR3=%p\n",
                pSession,
                pSession->GipMapObjR3 != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pSession->GipMapObjR3) : NULL,
                pSession->GipMapObjR3));
#else
    LogFlow(("SUPR0GipUnmap: pSession=%p\n", pSession));
#endif
    AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);

    /* Serialize against SUPR0GipMap and other unmappers. */
#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRequest(pDevExt->mtxGip, RT_INDEFINITE_WAIT);
#else
    RTSemFastMutexRequest(pDevExt->mtxGip);
#endif

    /*
     * Unmap anything?  Frees the lazily created ring-3 mapping, if present.
     */
    if (pSession->GipMapObjR3 != NIL_RTR0MEMOBJ)
    {
        rc = RTR0MemObjFree(pSession->GipMapObjR3, false);
        AssertRC(rc);
        if (RT_SUCCESS(rc))
            pSession->GipMapObjR3 = NIL_RTR0MEMOBJ;
    }

    /*
     * Dereference global GIP.  The last user stops the GIP timer and gives
     * back any raised system timer resolution.
     */
    if (pSession->fGipReferenced && !rc)
    {
        pSession->fGipReferenced = 0;
        if (   pDevExt->cGipUsers > 0
            && !--pDevExt->cGipUsers)
        {
            LogFlow(("SUPR0GipUnmap: Suspends GIP updating\n"));
#ifndef DO_NOT_START_GIP
            rc = RTTimerStop(pDevExt->pGipTimer); AssertRC(rc); rc = VINF_SUCCESS;
#endif
            supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
        }
    }

#ifdef SUPDRV_USE_MUTEX_FOR_GIP
    RTSemMutexRelease(pDevExt->mtxGip);
#else
    RTSemFastMutexRelease(pDevExt->mtxGip);
#endif

    return rc;
}
744
745
746/**
747 * Gets the GIP pointer.
748 *
749 * @returns Pointer to the GIP or NULL.
750 */
751SUPDECL(PSUPGLOBALINFOPAGE) SUPGetGIP(void)
752{
753 return g_pSUPGlobalInfoPage;
754}
755
756
757
758
759
760/*
761 *
762 *
763 * GIP Initialization, Termination and CPU Offline / Online Related Code.
764 * GIP Initialization, Termination and CPU Offline / Online Related Code.
765 * GIP Initialization, Termination and CPU Offline / Online Related Code.
766 *
767 *
768 */
769
770static void supdrvGipInitSetCpuFreq(PSUPGLOBALINFOPAGE pGip, uint64_t nsElapsed, uint64_t cElapsedTscTicks)
771{
772 /*
773 * Calculate the frequency.
774 */
775 uint64_t uCpuHz;
776 if ( cElapsedTscTicks < UINT64_MAX / RT_NS_1SEC
777 && nsElapsed < UINT32_MAX)
778 uCpuHz = ASMMultU64ByU32DivByU32(cElapsedTscTicks, RT_NS_1SEC, (uint32_t)nsElapsed);
779 else
780 {
781 RTUINT128U CpuHz, Tmp, Divisor;
782 CpuHz.s.Lo = CpuHz.s.Hi = 0;
783 RTUInt128MulU64ByU64(&Tmp, cElapsedTscTicks, RT_NS_1SEC_64);
784 RTUInt128Div(&CpuHz, &Tmp, RTUInt128AssignU64(&Divisor, nsElapsed));
785 uCpuHz = CpuHz.s.Lo;
786 }
787
788 /*
789 * Update the GIP.
790 */
791 ASMAtomicWriteU64(&pGip->u64CpuHz, uCpuHz);
792 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
793 ASMAtomicWriteU64(&pGip->aCPUs[0].u64CpuHz, uCpuHz);
794}
795
796
797/**
798 * Timer callback function for TSC frequency refinement in invariant GIP mode.
799 *
800 * This is started during driver init and fires once
801 * GIP_TSC_REFINE_PREIOD_IN_SECS seconds later.
802 *
803 * @param pTimer The timer.
804 * @param pvUser Opaque pointer to the device instance data.
805 * @param iTick The timer tick.
806 */
807static DECLCALLBACK(void) supdrvInitRefineInvariantTscTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
808{
809 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
810 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
811 RTCPUID idCpu;
812 uint64_t cNsElapsed;
813 uint64_t cTscTicksElapsed;
814 uint64_t nsNow;
815 uint64_t uTsc;
816 RTCCUINTREG uFlags;
817
818 /* Paranoia. */
819 AssertReturnVoid(pGip);
820 AssertReturnVoid(pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC);
821
822 /*
823 * Try get close to the next clock tick as usual.
824 *
825 * PORTME: If timers are called from the clock interrupt handler, or
826 * an interrupt handler with higher priority than the clock
827 * interrupt, or spinning for ages in timer handlers is frowned
828 * upon, this look must be disabled!
829 *
830 * Darwin, FreeBSD, Linux, Solaris, Windows 8.1+:
831 * High RTTimeSystemNanoTS resolution should prevent any noticable
832 * spinning her.
833 *
834 * Windows 8.0 and earlier:
835 * We're running in a DPC here, so we may trigger the DPC watchdog?
836 *
837 * OS/2:
838 * Timer callbacks are done in the clock interrupt, so skip it.
839 */
840#if !defined(RT_OS_OS2)
841 nsNow = RTTimeSystemNanoTS();
842 while (RTTimeSystemNanoTS() == nsNow)
843 ASMNopPause();
844#endif
845
846 uFlags = ASMIntDisableFlags();
847 uTsc = ASMReadTSC();
848 nsNow = RTTimeSystemNanoTS();
849 idCpu = RTMpCpuId();
850 ASMSetFlags(uFlags);
851
852 cNsElapsed = nsNow - pDevExt->nsStartInvarTscRefine;
853 cTscTicksElapsed = uTsc - pDevExt->uTscStartInvarTscRefine;
854
855 /*
856 * If the above measurement was taken on a different CPU than the one we
857 * started the rprocess on, cTscTicksElapsed will need to be adjusted with
858 * the TSC deltas of both the CPUs.
859 *
860 * We ASSUME that the delta calculation process takes less time than the
861 * TSC frequency refinement timer. If it doesn't, we'll complain and
862 * drop the frequency refinement.
863 *
864 * Note! We cannot entirely trust enmUseTscDelta here because it's
865 * downgraded after each delta calculation.
866 */
867 if ( idCpu != pDevExt->idCpuInvarTscRefine
868 && pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
869 {
870 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine);
871 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpu);
872 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
873 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
874 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
875 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
876 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
877 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
878 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopGipCpu != INT64_MAX))
879 {
880 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
881 {
882 /* cTscTicksElapsed = (uTsc - iStopTscDelta) - (pDevExt->uTscStartInvarTscRefine - iStartTscDelta); */
883 cTscTicksElapsed += iStartTscDelta - iStopTscDelta;
884 }
885 }
886 /*
887 * Allow 5 times the refinement period to elapse before we give up on the TSC delta
888 * calculations.
889 */
890 else if (cNsElapsed <= GIP_TSC_REFINE_PREIOD_IN_SECS * 5 * RT_NS_1SEC_64)
891 {
892 int rc = RTTimerStart(pTimer, RT_NS_1SEC);
893 AssertRC(rc);
894 return;
895 }
896 else
897 {
898 SUPR0Printf("vboxdrv: Failed to refine invariant TSC frequency because deltas are unavailable after %u (%u) seconds\n",
899 (uint32_t)(cNsElapsed / RT_NS_1SEC), GIP_TSC_REFINE_PREIOD_IN_SECS);
900 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
901 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
902 return;
903 }
904 }
905
906 /*
907 * Calculate and update the CPU frequency variables in GIP.
908 *
909 * If there is a GIP user already and we've already refined the frequency
910 * a couple of times, don't update it as we want a stable frequency value
911 * for all VMs.
912 */
913 if ( pDevExt->cGipUsers == 0
914 || cNsElapsed < RT_NS_1SEC * 2)
915 {
916 supdrvGipInitSetCpuFreq(pGip, cNsElapsed, cTscTicksElapsed);
917
918 /*
919 * Reschedule the timer if we haven't yet reached the defined refinement period.
920 */
921 if (cNsElapsed < GIP_TSC_REFINE_PREIOD_IN_SECS * RT_NS_1SEC_64)
922 {
923 int rc = RTTimerStart(pTimer, RT_NS_1SEC);
924 AssertRC(rc);
925 }
926 }
927}
928
929
/**
 * Start the TSC-frequency refinment timer for the invariant TSC GIP mode.
 *
 * We cannot use this in the synchronous and asynchronous tsc GIP modes because
 * the CPU may change the TSC frequency between now and when the timer fires
 * (supdrvInitAsyncRefineTscTimer).
 *
 * @param   pDevExt     Pointer to the device instance data.
 * @param   pGip        Pointer to the GIP.
 */
static void supdrvGipInitStartTimerForRefiningInvariantTscFreq(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip)
{
    uint64_t u64NanoTS;
    RTCCUINTREG uFlags;
    int rc;

    /*
     * Record the TSC and NanoTS as the starting anchor point for refinement
     * of the TSC.  We try get as close to a clock tick as possible on systems
     * which does not provide high resolution time.
     */
    u64NanoTS = RTTimeSystemNanoTS();
    while (RTTimeSystemNanoTS() == u64NanoTS)
        ASMNopPause();

    /* Take the anchor samples with interrupts disabled so TSC, time and CPU
       ID all refer to the same instant. */
    uFlags = ASMIntDisableFlags();
    pDevExt->uTscStartInvarTscRefine = ASMReadTSC();
    pDevExt->nsStartInvarTscRefine = RTTimeSystemNanoTS();
    pDevExt->idCpuInvarTscRefine = RTMpCpuId();
    ASMSetFlags(uFlags);

    /*
     * Create a timer that runs on the same CPU so we won't have a dependency
     * on the TSC-delta and can run in parallel to it. On systems that does not
     * implement CPU specific timers we'll apply deltas in the timer callback,
     * just like we do for CPUs going offline.
     *
     * The longer the refinement interval the better the accuracy, at least in
     * theory. If it's too long though, ring-3 may already be starting its
     * first VMs before we're done. On most systems we will be loading the
     * support driver during boot and VMs won't be started for a while yet,
     * it is really only a problem during development (especially with
     * on-demand driver starting on windows).
     *
     * To avoid wasting time doing a long supdrvGipInitMeasureTscFreq call
     * to calculate the frequency during driver loading, the timer is set
     * to fire after 200 ms the first time. It will then reschedule itself
     * to fire every second until GIP_TSC_REFINE_PREIOD_IN_SECS has been
     * reached or it notices that there is a user land client with GIP
     * mapped (we want a stable frequency for all VMs).
     */
    rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, 0 /* one-shot */,
                         RTTIMER_FLAGS_CPU(RTMpCpuIdToSetIndex(pDevExt->idCpuInvarTscRefine)),
                         supdrvInitRefineInvariantTscTimer, pDevExt);
    if (RT_SUCCESS(rc))
    {
        rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
        if (RT_SUCCESS(rc))
            return;
        RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
    }

    /* Fall back on an any-CPU timer if CPU-specific timers aren't supported
       (or the anchor CPU went offline); deltas are then handled in the
       callback as noted above. */
    if (rc == VERR_CPU_OFFLINE || rc == VERR_NOT_SUPPORTED)
    {
        rc = RTTimerCreateEx(&pDevExt->pInvarTscRefineTimer, 0 /* one-shot */, RTTIMER_FLAGS_CPU_ANY,
                             supdrvInitRefineInvariantTscTimer, pDevExt);
        if (RT_SUCCESS(rc))
        {
            rc = RTTimerStart(pDevExt->pInvarTscRefineTimer, 2*RT_NS_100MS);
            if (RT_SUCCESS(rc))
                return;
            RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
        }
    }

    /* No refinement timer - the initial frequency measurement stands. */
    pDevExt->pInvarTscRefineTimer = NULL;
    OSDBGPRINT(("vboxdrv: Failed to create or start TSC frequency refinement timer: rc=%Rrc\n", rc));
}
1010
1011
1012/**
1013 * @callback_method_impl{PFNRTMPWORKER,
1014 * RTMpOnSpecific callback for reading TSC and time on the CPU we started
1015 * the measurements on.}
1016 */
1017DECLCALLBACK(void) supdrvGipInitReadTscAndNanoTsOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1018{
1019 RTCCUINTREG uFlags = ASMIntDisableFlags();
1020 uint64_t *puTscStop = (uint64_t *)pvUser1;
1021 uint64_t *pnsStop = (uint64_t *)pvUser2;
1022
1023 *puTscStop = ASMReadTSC();
1024 *pnsStop = RTTimeSystemNanoTS();
1025
1026 ASMSetFlags(uFlags);
1027}
1028
1029
1030/**
1031 * Measures the TSC frequency of the system.
1032 *
1033 * The TSC frequency can vary on systems which are not reported as invariant.
1034 * On such systems the object of this function is to find out what the nominal,
1035 * maximum TSC frequency under 'normal' CPU operation.
1036 *
1037 * @returns VBox status code.
1038 * @param pDevExt Pointer to the device instance.
1039 * @param pGip Pointer to the GIP.
1040 * @param fRough Set if we're doing the rough calculation that the
1041 * TSC measuring code needs, where accuracy isn't all
1042 * that important (too high is better than to low).
1043 * When clear we try for best accuracy that we can
1044 * achieve in reasonably short time.
1045 */
1046static int supdrvGipInitMeasureTscFreq(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, bool fRough)
1047{
1048 uint32_t nsTimerIncr = RTTimerGetSystemGranularity();
1049 int cTriesLeft = fRough ? 4 : 2;
1050 while (cTriesLeft-- > 0)
1051 {
1052 RTCCUINTREG uFlags;
1053 uint64_t nsStart;
1054 uint64_t nsStop;
1055 uint64_t uTscStart;
1056 uint64_t uTscStop;
1057 RTCPUID idCpuStart;
1058 RTCPUID idCpuStop;
1059
1060 /*
1061 * Synchronize with the host OS clock tick on systems without high
1062 * resolution time API (older Windows version for example).
1063 */
1064 nsStart = RTTimeSystemNanoTS();
1065 while (RTTimeSystemNanoTS() == nsStart)
1066 ASMNopPause();
1067
1068 /*
1069 * Read the TSC and current time, noting which CPU we're on.
1070 */
1071 uFlags = ASMIntDisableFlags();
1072 uTscStart = ASMReadTSC();
1073 nsStart = RTTimeSystemNanoTS();
1074 idCpuStart = RTMpCpuId();
1075 ASMSetFlags(uFlags);
1076
1077 /*
1078 * Delay for a while.
1079 */
1080 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1081 {
1082 /*
1083 * Sleep-wait since the TSC frequency is constant, it eases host load.
1084 * Shorter interval produces more variance in the frequency (esp. Windows).
1085 */
1086 uint64_t msElapsed = 0;
1087 uint64_t msDelay = ( ((fRough ? 16 : 200) * RT_NS_1MS + nsTimerIncr - 1) / nsTimerIncr * nsTimerIncr - RT_NS_100US )
1088 / RT_NS_1MS;
1089 do
1090 {
1091 RTThreadSleep((RTMSINTERVAL)(msDelay - msElapsed));
1092 nsStop = RTTimeSystemNanoTS();
1093 msElapsed = (nsStop - nsStart) / RT_NS_1MS;
1094 } while (msElapsed < msDelay);
1095
1096 while (RTTimeSystemNanoTS() == nsStop)
1097 ASMNopPause();
1098 }
1099 else
1100 {
1101 /*
1102 * Busy-wait keeping the frequency up.
1103 */
1104 do
1105 {
1106 ASMNopPause();
1107 nsStop = RTTimeSystemNanoTS();
1108 } while (nsStop - nsStart < RT_NS_100MS);
1109 }
1110
1111 /*
1112 * Read the TSC and time again.
1113 */
1114 uFlags = ASMIntDisableFlags();
1115 uTscStop = ASMReadTSC();
1116 nsStop = RTTimeSystemNanoTS();
1117 idCpuStop = RTMpCpuId();
1118 ASMSetFlags(uFlags);
1119
1120 /*
1121 * If the CPU changes things get a bit complicated and what we
1122 * can get away with depends on the GIP mode / TSC reliablity.
1123 */
1124 if (idCpuStop != idCpuStart)
1125 {
1126 bool fDoXCall = false;
1127
1128 /*
1129 * Synchronous TSC mode: we're probably fine as it's unlikely
1130 * that we were rescheduled because of TSC throttling or power
1131 * management reasons, so just go ahead.
1132 */
1133 if (pGip->u32Mode == SUPGIPMODE_SYNC_TSC)
1134 {
1135 /* Probably ok, maybe we should retry once?. */
1136 Assert(pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_NOT_APPLICABLE);
1137 }
1138 /*
1139 * If we're just doing the rough measurement, do the cross call and
1140 * get on with things (we don't have deltas!).
1141 */
1142 else if (fRough)
1143 fDoXCall = true;
1144 /*
1145 * Invariant TSC mode: It doesn't matter if we have delta available
1146 * for both CPUs. That is not something we can assume at this point.
1147 *
1148 * Note! We cannot necessarily trust enmUseTscDelta here because it's
1149 * downgraded after each delta calculation and the delta
1150 * calculations may not be complete yet.
1151 */
1152 else if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1153 {
1154 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1155 {
1156 uint32_t iStartCpuSet = RTMpCpuIdToSetIndex(idCpuStart);
1157 uint32_t iStopCpuSet = RTMpCpuIdToSetIndex(idCpuStop);
1158 uint16_t iStartGipCpu = iStartCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
1159 ? pGip->aiCpuFromCpuSetIdx[iStartCpuSet] : UINT16_MAX;
1160 uint16_t iStopGipCpu = iStopCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
1161 ? pGip->aiCpuFromCpuSetIdx[iStopCpuSet] : UINT16_MAX;
1162 int64_t iStartTscDelta = iStartGipCpu < pGip->cCpus ? pGip->aCPUs[iStartGipCpu].i64TSCDelta : INT64_MAX;
1163 int64_t iStopTscDelta = iStopGipCpu < pGip->cCpus ? pGip->aCPUs[iStopGipCpu].i64TSCDelta : INT64_MAX;
1164 if (RT_LIKELY(iStartTscDelta != INT64_MAX && iStopGipCpu != INT64_MAX))
1165 {
1166 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
1167 {
1168 uTscStart -= iStartTscDelta;
1169 uTscStop -= iStopTscDelta;
1170 }
1171 }
1172 /*
1173 * Invalid CPU indexes are not caused by online/offline races, so
1174 * we have to trigger driver load failure if that happens as GIP
1175 * and IPRT assumptions are busted on this system.
1176 */
1177 else if (iStopGipCpu >= pGip->cCpus || iStartGipCpu >= pGip->cCpus)
1178 {
1179 SUPR0Printf("vboxdrv: Unexpected CPU index in supdrvGipInitMeasureTscFreq.\n");
1180 SUPR0Printf("vboxdrv: start: %u, %u, %#llx stop: %u, %u, %#llx\n",
1181 iStartCpuSet, iStartGipCpu, iStartTscDelta, iStopCpuSet, iStopGipCpu, iStopTscDelta);
1182 return VERR_INVALID_CPU_INDEX;
1183 }
1184 /*
1185 * No valid deltas. We retry, if we're on our last retry
1186 * we do the cross call instead just to get a result. The
1187 * frequency will be refined in a few seconds anyways.
1188 */
1189 else if (cTriesLeft > 0)
1190 continue;
1191 else
1192 fDoXCall = true;
1193 }
1194 }
1195 /*
1196 * Asynchronous TSC mode: This is bad as the reason we usually
1197 * use this mode is to deal with variable TSC frequencies and
1198 * deltas. So, we need to get the TSC from the same CPU as
1199 * started it, we also need to keep that CPU busy. So, retry
1200 * and fall back to the cross call on the last attempt.
1201 */
1202 else
1203 {
1204 Assert(pGip->u32Mode == SUPGIPMODE_ASYNC_TSC);
1205 if (cTriesLeft > 0)
1206 continue;
1207 fDoXCall = true;
1208 }
1209
1210 if (fDoXCall)
1211 {
1212 /*
1213 * Try read the TSC and timestamp on the start CPU.
1214 */
1215 int rc = RTMpOnSpecific(idCpuStart, supdrvGipInitReadTscAndNanoTsOnCpu, &uTscStop, &nsStop);
1216 if (RT_FAILURE(rc) && (!fRough || cTriesLeft > 0))
1217 continue;
1218 }
1219 }
1220
1221 /*
1222 * Calculate the TSC frequency and update it (shared with the refinement timer).
1223 */
1224 supdrvGipInitSetCpuFreq(pGip, nsStop - nsStart, uTscStop - uTscStart);
1225 return VINF_SUCCESS;
1226 }
1227
1228 Assert(!fRough);
1229 return VERR_SUPDRV_TSC_FREQ_MEASUREMENT_FAILED;
1230}
1231
1232
1233/**
1234 * Finds our (@a idCpu) entry, or allocates a new one if not found.
1235 *
1236 * @returns Index of the CPU in the cache set.
1237 * @param pGip The GIP.
1238 * @param idCpu The CPU ID.
1239 */
1240static uint32_t supdrvGipFindOrAllocCpuIndexForCpuId(PSUPGLOBALINFOPAGE pGip, RTCPUID idCpu)
1241{
1242 uint32_t i, cTries;
1243
1244 /*
1245 * ASSUMES that CPU IDs are constant.
1246 */
1247 for (i = 0; i < pGip->cCpus; i++)
1248 if (pGip->aCPUs[i].idCpu == idCpu)
1249 return i;
1250
1251 cTries = 0;
1252 do
1253 {
1254 for (i = 0; i < pGip->cCpus; i++)
1255 {
1256 bool fRc;
1257 ASMAtomicCmpXchgSize(&pGip->aCPUs[i].idCpu, idCpu, NIL_RTCPUID, fRc);
1258 if (fRc)
1259 return i;
1260 }
1261 } while (cTries++ < 32);
1262 AssertReleaseFailed();
1263 return i - 1;
1264}
1265
1266
/**
 * The calling CPU should be accounted as online, update GIP accordingly.
 *
 * This is used by supdrvGipCreate() as well as supdrvGipMpEvent().
 *
 * Must be called on the CPU being onlined/initialized (asserted below), since
 * it reads that CPU's TSC and APIC ID.  All GIP updates are done under the
 * GIP spinlock because this can fire on all CPUs simultaneously.
 *
 * @param   pDevExt             The device extension.
 * @param   idCpu               The CPU ID.
 */
static void supdrvGipMpEventOnlineOrInitOnCpu(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
{
    int        iCpuSet = 0;
    uint16_t   idApic = UINT16_MAX;
    uint32_t   i = 0;
    uint64_t   u64NanoTS = 0;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    AssertPtrReturnVoid(pGip);
    AssertRelease(idCpu == RTMpCpuId());
    Assert(pGip->cPossibleCpus == RTMpGetCount());

    /*
     * Do this behind a spinlock with interrupts disabled as this can fire
     * on all CPUs simultaneously, see @bugref{6110}.
     */
    RTSpinlockAcquire(pDevExt->hGipSpinlock);

    /*
     * Update the globals.
     */
    ASMAtomicWriteU16(&pGip->cPresentCpus,  RTMpGetPresentCount());
    ASMAtomicWriteU16(&pGip->cOnlineCpus,   RTMpGetOnlineCount());
    iCpuSet = RTMpCpuIdToSetIndex(idCpu);
    if (iCpuSet >= 0)
    {
        Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
        RTCpuSetAddByIndex(&pGip->OnlineCpuSet, iCpuSet);
        RTCpuSetAddByIndex(&pGip->PresentCpuSet, iCpuSet);
    }

    /*
     * Update the entry.  Back-date the timestamp by one update interval so
     * the first delta computed for this CPU spans a plausible period.
     */
    u64NanoTS = RTTimeSystemNanoTS() - pGip->u32UpdateIntervalNS;
    i = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);

    supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, pGip->u64CpuHz);

    idApic = ASMGetApicId();
    ASMAtomicWriteU16(&pGip->aCPUs[i].idApic,  idApic);
    ASMAtomicWriteS16(&pGip->aCPUs[i].iCpuSet, (int16_t)iCpuSet);
    ASMAtomicWriteSize(&pGip->aCPUs[i].idCpu,  idCpu);

    /*
     * Update the APIC ID and CPU set index mappings.
     * NOTE(review): assumes idApic fits the aiCpuFromApicId table; the CPU
     * count check in supdrvGipCreate limits this to 256 entries - confirm
     * the table dimension matches.
     */
    ASMAtomicWriteU16(&pGip->aiCpuFromApicId[idApic],     i);
    ASMAtomicWriteU16(&pGip->aiCpuFromCpuSetIdx[iCpuSet], i);

    /* Update the Mp online/offline counter. */
    ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);

    /* Add this CPU to the set of CPUs for which we need to calculate their TSC-deltas. */
    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
    {
        RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
#ifdef SUPDRV_USE_TSC_DELTA_THREAD
        /* Kick the TSC-delta measurement thread if it is idle or measuring. */
        RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
        if (   pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
            || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
        {
            pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
        }
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
#endif
    }

    /* commit it - the state write publishes the entry to readers. */
    ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_ONLINE);

    RTSpinlockRelease(pDevExt->hGipSpinlock);
}
1348
1349
/**
 * The CPU should be accounted as offline, update the GIP accordingly.
 *
 * This is used by supdrvGipMpEvent.  Unlike the online path, this may run on
 * any CPU (only set membership and per-entry state are touched, nothing is
 * read from the departing CPU itself).
 *
 * @param   pDevExt             The device extension.
 * @param   idCpu               The CPU ID.
 */
static void supdrvGipMpEventOffline(PSUPDRVDEVEXT pDevExt, RTCPUID idCpu)
{
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    int         iCpuSet;
    unsigned    i;

    AssertPtrReturnVoid(pGip);
    RTSpinlockAcquire(pDevExt->hGipSpinlock);

    iCpuSet = RTMpCpuIdToSetIndex(idCpu);
    AssertReturnVoid(iCpuSet >= 0);

    /* Look up the GIP entry via the set index mapping and sanity check it. */
    i = pGip->aiCpuFromCpuSetIdx[iCpuSet];
    AssertReturnVoid(i < pGip->cCpus);
    AssertReturnVoid(pGip->aCPUs[i].idCpu == idCpu);

    Assert(RTCpuSetIsMemberByIndex(&pGip->PossibleCpuSet, iCpuSet));
    RTCpuSetDelByIndex(&pGip->OnlineCpuSet, iCpuSet);

    /* Update the Mp online/offline counter. */
    ASMAtomicIncU32(&pDevExt->cMpOnOffEvents);

    /* If we are the initiator going offline while measuring the TSC delta, unspin other waiting CPUs! */
    if (ASMAtomicReadU32(&pDevExt->idTscDeltaInitiator) == idCpu)
    {
        ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_START);
        ASMAtomicWriteU64(&pGip->aCPUs[i].u64TSCSample, ~GIP_TSC_DELTA_RSVD);
    }

    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
    {
        /* Reset the TSC delta, we will recalculate it lazily. */
        ASMAtomicWriteS64(&pGip->aCPUs[i].i64TSCDelta, INT64_MAX);
        /* Remove this CPU from the set of CPUs that we have obtained the TSC deltas. */
        RTCpuSetDelByIndex(&pDevExt->TscDeltaObtainedCpuSet, iCpuSet);
    }

    /* commit it - marks the entry offline for readers. */
    ASMAtomicWriteSize(&pGip->aCPUs[i].enmState, SUPGIPCPUSTATE_OFFLINE);

    RTSpinlockRelease(pDevExt->hGipSpinlock);
}
1400
1401
1402/**
1403 * Multiprocessor event notification callback.
1404 *
1405 * This is used to make sure that the GIP master gets passed on to
1406 * another CPU. It also updates the associated CPU data.
1407 *
1408 * @param enmEvent The event.
1409 * @param idCpu The cpu it applies to.
1410 * @param pvUser Pointer to the device extension.
1411 *
1412 * @remarks This function -must- fire on the newly online'd CPU for the
1413 * RTMPEVENT_ONLINE case and can fire on any CPU for the
1414 * RTMPEVENT_OFFLINE case.
1415 */
1416static DECLCALLBACK(void) supdrvGipMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser)
1417{
1418 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
1419 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
1420
1421 AssertRelease(!RTThreadPreemptIsEnabled(NIL_RTTHREAD));
1422
1423 /*
1424 * Update the GIP CPU data.
1425 */
1426 if (pGip)
1427 {
1428 switch (enmEvent)
1429 {
1430 case RTMPEVENT_ONLINE:
1431 AssertRelease(idCpu == RTMpCpuId());
1432 supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
1433 break;
1434 case RTMPEVENT_OFFLINE:
1435 supdrvGipMpEventOffline(pDevExt, idCpu);
1436 break;
1437 }
1438 }
1439
1440 /*
1441 * Make sure there is a master GIP.
1442 */
1443 if (enmEvent == RTMPEVENT_OFFLINE)
1444 {
1445 RTCPUID idGipMaster = ASMAtomicReadU32(&pDevExt->idGipMaster);
1446 if (idGipMaster == idCpu)
1447 {
1448 /*
1449 * The GIP master is going offline, find a new one.
1450 */
1451 bool fIgnored;
1452 unsigned i;
1453 RTCPUID idNewGipMaster = NIL_RTCPUID;
1454 RTCPUSET OnlineCpus;
1455 RTMpGetOnlineSet(&OnlineCpus);
1456
1457 for (i = 0; i < RTCPUSET_MAX_CPUS; i++)
1458 if (RTCpuSetIsMemberByIndex(&OnlineCpus, i))
1459 {
1460 RTCPUID idCurCpu = RTMpCpuIdFromSetIndex(i);
1461 if (idCurCpu != idGipMaster)
1462 {
1463 idNewGipMaster = idCurCpu;
1464 break;
1465 }
1466 }
1467
1468 Log(("supdrvGipMpEvent: Gip master %#lx -> %#lx\n", (long)idGipMaster, (long)idNewGipMaster));
1469 ASMAtomicCmpXchgSize(&pDevExt->idGipMaster, idNewGipMaster, idGipMaster, fIgnored);
1470 NOREF(fIgnored);
1471 }
1472 }
1473}
1474
1475
1476/**
1477 * On CPU initialization callback for RTMpOnAll.
1478 *
1479 * @param idCpu The CPU ID.
1480 * @param pvUser1 The device extension.
1481 * @param pvUser2 The GIP.
1482 */
1483static DECLCALLBACK(void) supdrvGipInitOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1484{
1485 /* This is good enough, even though it will update some of the globals a
1486 bit to much. */
1487 supdrvGipMpEventOnlineOrInitOnCpu((PSUPDRVDEVEXT)pvUser1, idCpu);
1488}
1489
1490
1491/**
1492 * Callback used by supdrvDetermineAsyncTSC to read the TSC on a CPU.
1493 *
1494 * @param idCpu Ignored.
1495 * @param pvUser1 Where to put the TSC.
1496 * @param pvUser2 Ignored.
1497 */
1498static DECLCALLBACK(void) supdrvGipInitDetermineAsyncTscWorker(RTCPUID idCpu, void *pvUser1, void *pvUser2)
1499{
1500 ASMAtomicWriteU64((uint64_t volatile *)pvUser1, ASMReadTSC());
1501}
1502
1503
1504/**
1505 * Determine if Async GIP mode is required because of TSC drift.
1506 *
1507 * When using the default/normal timer code it is essential that the time stamp counter
1508 * (TSC) runs never backwards, that is, a read operation to the counter should return
1509 * a bigger value than any previous read operation. This is guaranteed by the latest
1510 * AMD CPUs and by newer Intel CPUs which never enter the C2 state (P4). In any other
1511 * case we have to choose the asynchronous timer mode.
1512 *
1513 * @param poffMin Pointer to the determined difference between different
1514 * cores (optional, can be NULL).
1515 * @return false if the time stamp counters appear to be synchronized, true otherwise.
1516 */
1517static bool supdrvGipInitDetermineAsyncTsc(uint64_t *poffMin)
1518{
1519 /*
1520 * Just iterate all the cpus 8 times and make sure that the TSC is
1521 * ever increasing. We don't bother taking TSC rollover into account.
1522 */
1523 int iEndCpu = RTMpGetArraySize();
1524 int iCpu;
1525 int cLoops = 8;
1526 bool fAsync = false;
1527 int rc = VINF_SUCCESS;
1528 uint64_t offMax = 0;
1529 uint64_t offMin = ~(uint64_t)0;
1530 uint64_t PrevTsc = ASMReadTSC();
1531
1532 while (cLoops-- > 0)
1533 {
1534 for (iCpu = 0; iCpu < iEndCpu; iCpu++)
1535 {
1536 uint64_t CurTsc;
1537 rc = RTMpOnSpecific(RTMpCpuIdFromSetIndex(iCpu), supdrvGipInitDetermineAsyncTscWorker, &CurTsc, NULL);
1538 if (RT_SUCCESS(rc))
1539 {
1540 if (CurTsc <= PrevTsc)
1541 {
1542 fAsync = true;
1543 offMin = offMax = PrevTsc - CurTsc;
1544 Log(("supdrvGipInitDetermineAsyncTsc: iCpu=%d cLoops=%d CurTsc=%llx PrevTsc=%llx\n",
1545 iCpu, cLoops, CurTsc, PrevTsc));
1546 break;
1547 }
1548
1549 /* Gather statistics (except the first time). */
1550 if (iCpu != 0 || cLoops != 7)
1551 {
1552 uint64_t off = CurTsc - PrevTsc;
1553 if (off < offMin)
1554 offMin = off;
1555 if (off > offMax)
1556 offMax = off;
1557 Log2(("%d/%d: off=%llx\n", cLoops, iCpu, off));
1558 }
1559
1560 /* Next */
1561 PrevTsc = CurTsc;
1562 }
1563 else if (rc == VERR_NOT_SUPPORTED)
1564 break;
1565 else
1566 AssertMsg(rc == VERR_CPU_NOT_FOUND || rc == VERR_CPU_OFFLINE, ("%d\n", rc));
1567 }
1568
1569 /* broke out of the loop. */
1570 if (iCpu < iEndCpu)
1571 break;
1572 }
1573
1574 if (poffMin)
1575 *poffMin = offMin; /* Almost RTMpOnSpecific profiling. */
1576 Log(("supdrvGipInitDetermineAsyncTsc: returns %d; iEndCpu=%d rc=%d offMin=%llx offMax=%llx\n",
1577 fAsync, iEndCpu, rc, offMin, offMax));
1578#if !defined(RT_OS_SOLARIS) && !defined(RT_OS_OS2) && !defined(RT_OS_WINDOWS)
1579 OSDBGPRINT(("vboxdrv: fAsync=%d offMin=%#lx offMax=%#lx\n", fAsync, (long)offMin, (long)offMax));
1580#endif
1581 return fAsync;
1582}
1583
1584
1585/**
1586 * supdrvGipInit() worker that determines the GIP TSC mode.
1587 *
1588 * @returns The most suitable TSC mode.
1589 * @param pDevExt Pointer to the device instance data.
1590 */
1591static SUPGIPMODE supdrvGipInitDetermineTscMode(PSUPDRVDEVEXT pDevExt)
1592{
1593 uint64_t u64DiffCoresIgnored;
1594 uint32_t uEAX, uEBX, uECX, uEDX;
1595
1596 /*
1597 * Establish whether the CPU advertises TSC as invariant, we need that in
1598 * a couple of places below.
1599 */
1600 bool fInvariantTsc = false;
1601 if (ASMHasCpuId())
1602 {
1603 uEAX = ASMCpuId_EAX(0x80000000);
1604 if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
1605 {
1606 uEDX = ASMCpuId_EDX(0x80000007);
1607 if (uEDX & X86_CPUID_AMD_ADVPOWER_EDX_TSCINVAR)
1608 fInvariantTsc = true;
1609 }
1610 }
1611
1612 /*
1613 * On single CPU systems, we don't need to consider ASYNC mode.
1614 */
1615 if (RTMpGetCount() <= 1)
1616 return fInvariantTsc ? SUPGIPMODE_INVARIANT_TSC : SUPGIPMODE_SYNC_TSC;
1617
1618 /*
1619 * Allow the user and/or OS specific bits to force async mode.
1620 */
1621 if (supdrvOSGetForcedAsyncTscMode(pDevExt))
1622 return SUPGIPMODE_ASYNC_TSC;
1623
1624 /*
1625 * Use invariant mode if the CPU says TSC is invariant.
1626 */
1627 if (fInvariantTsc)
1628 return SUPGIPMODE_INVARIANT_TSC;
1629
1630 /*
1631 * TSC is not invariant and we're on SMP, this presents two problems:
1632 *
1633 * (1) There might be a skew between the CPU, so that cpu0
1634 * returns a TSC that is slightly different from cpu1.
1635 * This screw may be due to (2), bad TSC initialization
1636 * or slightly different TSC rates.
1637 *
1638 * (2) Power management (and other things) may cause the TSC
1639 * to run at a non-constant speed, and cause the speed
1640 * to be different on the cpus. This will result in (1).
1641 *
1642 * If any of the above is detected, we will have to use ASYNC mode.
1643 */
1644 /* (1). Try check for current differences between the cpus. */
1645 if (supdrvGipInitDetermineAsyncTsc(&u64DiffCoresIgnored))
1646 return SUPGIPMODE_ASYNC_TSC;
1647
1648 /* (2) If it's an AMD CPU with power management, we won't trust its TSC. */
1649 ASMCpuId(0, &uEAX, &uEBX, &uECX, &uEDX);
1650 if ( ASMIsValidStdRange(uEAX)
1651 && ASMIsAmdCpuEx(uEBX, uECX, uEDX))
1652 {
1653 /* Check for APM support. */
1654 uEAX = ASMCpuId_EAX(0x80000000);
1655 if (ASMIsValidExtRange(uEAX) && uEAX >= 0x80000007)
1656 {
1657 uEDX = ASMCpuId_EDX(0x80000007);
1658 if (uEDX & 0x3e) /* STC|TM|THERMTRIP|VID|FID. Ignore TS. */
1659 return SUPGIPMODE_ASYNC_TSC;
1660 }
1661 }
1662
1663 return SUPGIPMODE_SYNC_TSC;
1664}
1665
1666
1667/**
1668 * Initializes per-CPU GIP information.
1669 *
1670 * @param pGip Pointer to the GIP.
1671 * @param pCpu Pointer to which GIP CPU to initalize.
1672 * @param u64NanoTS The current nanosecond timestamp.
1673 * @param uCpuHz The CPU frequency to set, 0 if the caller doesn't know.
1674 */
1675static void supdrvGipInitCpu(PSUPGLOBALINFOPAGE pGip, PSUPGIPCPU pCpu, uint64_t u64NanoTS, uint64_t uCpuHz)
1676{
1677 pCpu->u32TransactionId = 2;
1678 pCpu->u64NanoTS = u64NanoTS;
1679 pCpu->u64TSC = ASMReadTSC();
1680 pCpu->u64TSCSample = GIP_TSC_DELTA_RSVD;
1681 pCpu->i64TSCDelta = pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED ? INT64_MAX : 0;
1682
1683 ASMAtomicWriteSize(&pCpu->enmState, SUPGIPCPUSTATE_INVALID);
1684 ASMAtomicWriteSize(&pCpu->idCpu, NIL_RTCPUID);
1685 ASMAtomicWriteS16(&pCpu->iCpuSet, -1);
1686 ASMAtomicWriteU16(&pCpu->idApic, UINT16_MAX);
1687
1688 /*
1689 * The first time we're called, we don't have a CPU frequency handy,
1690 * so pretend it's a 4 GHz CPU. On CPUs that are online, we'll get
1691 * called again and at that point we have a more plausible CPU frequency
1692 * value handy. The frequency history will also be adjusted again on
1693 * the 2nd timer callout (maybe we can skip that now?).
1694 */
1695 if (!uCpuHz)
1696 {
1697 pCpu->u64CpuHz = _4G - 1;
1698 pCpu->u32UpdateIntervalTSC = (uint32_t)((_4G - 1) / pGip->u32UpdateHz);
1699 }
1700 else
1701 {
1702 pCpu->u64CpuHz = uCpuHz;
1703 pCpu->u32UpdateIntervalTSC = (uint32_t)(uCpuHz / pGip->u32UpdateHz);
1704 }
1705 pCpu->au32TSCHistory[0]
1706 = pCpu->au32TSCHistory[1]
1707 = pCpu->au32TSCHistory[2]
1708 = pCpu->au32TSCHistory[3]
1709 = pCpu->au32TSCHistory[4]
1710 = pCpu->au32TSCHistory[5]
1711 = pCpu->au32TSCHistory[6]
1712 = pCpu->au32TSCHistory[7]
1713 = pCpu->u32UpdateIntervalTSC;
1714}
1715
1716
1717/**
1718 * Initializes the GIP data.
1719 *
1720 * @param pDevExt Pointer to the device instance data.
1721 * @param pGip Pointer to the read-write kernel mapping of the GIP.
1722 * @param HCPhys The physical address of the GIP.
1723 * @param u64NanoTS The current nanosecond timestamp.
1724 * @param uUpdateHz The update frequency.
1725 * @param uUpdateIntervalNS The update interval in nanoseconds.
1726 * @param cCpus The CPU count.
1727 */
1728static void supdrvGipInit(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip, RTHCPHYS HCPhys,
1729 uint64_t u64NanoTS, unsigned uUpdateHz, unsigned uUpdateIntervalNS, unsigned cCpus)
1730{
1731 size_t const cbGip = RT_ALIGN_Z(RT_OFFSETOF(SUPGLOBALINFOPAGE, aCPUs[cCpus]), PAGE_SIZE);
1732 unsigned i;
1733#ifdef DEBUG_DARWIN_GIP
1734 OSDBGPRINT(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1735#else
1736 LogFlow(("supdrvGipInit: pGip=%p HCPhys=%lx u64NanoTS=%llu uUpdateHz=%d cCpus=%u\n", pGip, (long)HCPhys, u64NanoTS, uUpdateHz, cCpus));
1737#endif
1738
1739 /*
1740 * Initialize the structure.
1741 */
1742 memset(pGip, 0, cbGip);
1743
1744 pGip->u32Magic = SUPGLOBALINFOPAGE_MAGIC;
1745 pGip->u32Version = SUPGLOBALINFOPAGE_VERSION;
1746 pGip->u32Mode = supdrvGipInitDetermineTscMode(pDevExt);
1747 if ( pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
1748 /*|| pGip->u32Mode == SUPGIPMODE_SYNC_TSC */)
1749 pGip->enmUseTscDelta = supdrvOSAreTscDeltasInSync() /* Allow OS override (windows). */
1750 ? SUPGIPUSETSCDELTA_ZERO_CLAIMED : SUPGIPUSETSCDELTA_PRACTICALLY_ZERO /* downgrade later */;
1751 else
1752 pGip->enmUseTscDelta = SUPGIPUSETSCDELTA_NOT_APPLICABLE;
1753 pGip->cCpus = (uint16_t)cCpus;
1754 pGip->cPages = (uint16_t)(cbGip / PAGE_SIZE);
1755 pGip->u32UpdateHz = uUpdateHz;
1756 pGip->u32UpdateIntervalNS = uUpdateIntervalNS;
1757 pGip->fGetGipCpu = SUPGIPGETCPU_APIC_ID;
1758 RTCpuSetEmpty(&pGip->OnlineCpuSet);
1759 RTCpuSetEmpty(&pGip->PresentCpuSet);
1760 RTMpGetSet(&pGip->PossibleCpuSet);
1761 pGip->cOnlineCpus = RTMpGetOnlineCount();
1762 pGip->cPresentCpus = RTMpGetPresentCount();
1763 pGip->cPossibleCpus = RTMpGetCount();
1764 pGip->idCpuMax = RTMpGetMaxCpuId();
1765 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromApicId); i++)
1766 pGip->aiCpuFromApicId[i] = UINT16_MAX;
1767 for (i = 0; i < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx); i++)
1768 pGip->aiCpuFromCpuSetIdx[i] = UINT16_MAX;
1769 for (i = 0; i < cCpus; i++)
1770 supdrvGipInitCpu(pGip, &pGip->aCPUs[i], u64NanoTS, 0 /*uCpuHz*/);
1771
1772 /*
1773 * Link it to the device extension.
1774 */
1775 pDevExt->pGip = pGip;
1776 pDevExt->HCPhysGip = HCPhys;
1777 pDevExt->cGipUsers = 0;
1778}
1779
1780
1781/**
1782 * Creates the GIP.
1783 *
1784 * @returns VBox status code.
1785 * @param pDevExt Instance data. GIP stuff may be updated.
1786 */
1787int VBOXCALL supdrvGipCreate(PSUPDRVDEVEXT pDevExt)
1788{
1789 PSUPGLOBALINFOPAGE pGip;
1790 RTHCPHYS HCPhysGip;
1791 uint32_t u32SystemResolution;
1792 uint32_t u32Interval;
1793 uint32_t u32MinInterval;
1794 uint32_t uMod;
1795 unsigned cCpus;
1796 int rc;
1797
1798 LogFlow(("supdrvGipCreate:\n"));
1799
1800 /* Assert order. */
1801 Assert(pDevExt->u32SystemTimerGranularityGrant == 0);
1802 Assert(pDevExt->GipMemObj == NIL_RTR0MEMOBJ);
1803 Assert(!pDevExt->pGipTimer);
1804
1805 /*
1806 * Check the CPU count.
1807 */
1808 cCpus = RTMpGetArraySize();
1809 if ( cCpus > RTCPUSET_MAX_CPUS
1810 || cCpus > 256 /* ApicId is used for the mappings */)
1811 {
1812 SUPR0Printf("VBoxDrv: Too many CPUs (%u) for the GIP (max %u)\n", cCpus, RT_MIN(RTCPUSET_MAX_CPUS, 256));
1813 return VERR_TOO_MANY_CPUS;
1814 }
1815
1816 /*
1817 * Allocate a contiguous set of pages with a default kernel mapping.
1818 */
1819 rc = RTR0MemObjAllocCont(&pDevExt->GipMemObj, RT_UOFFSETOF(SUPGLOBALINFOPAGE, aCPUs[cCpus]), false /*fExecutable*/);
1820 if (RT_FAILURE(rc))
1821 {
1822 OSDBGPRINT(("supdrvGipCreate: failed to allocate the GIP page. rc=%d\n", rc));
1823 return rc;
1824 }
1825 pGip = (PSUPGLOBALINFOPAGE)RTR0MemObjAddress(pDevExt->GipMemObj); AssertPtr(pGip);
1826 HCPhysGip = RTR0MemObjGetPagePhysAddr(pDevExt->GipMemObj, 0); Assert(HCPhysGip != NIL_RTHCPHYS);
1827
1828 /*
1829 * Allocate the TSC-delta sync struct on a separate cache line.
1830 */
1831 pDevExt->pvTscDeltaSync = RTMemAllocZ(sizeof(SUPTSCDELTASYNC) + 63);
1832 pDevExt->pTscDeltaSync = RT_ALIGN_PT(pDevExt->pvTscDeltaSync, 64, PSUPTSCDELTASYNC);
1833 Assert(RT_ALIGN_PT(pDevExt->pTscDeltaSync, 64, PSUPTSCDELTASYNC) == pDevExt->pTscDeltaSync);
1834
1835 /*
1836 * Find a reasonable update interval and initialize the structure.
1837 */
1838 supdrvGipRequestHigherTimerFrequencyFromSystem(pDevExt);
1839 /** @todo figure out why using a 100Ms interval upsets timekeeping in VMs.
1840 * See @bugref{6710}. */
1841 u32MinInterval = RT_NS_10MS;
1842 u32SystemResolution = RTTimerGetSystemGranularity();
1843 u32Interval = u32MinInterval;
1844 uMod = u32MinInterval % u32SystemResolution;
1845 if (uMod)
1846 u32Interval += u32SystemResolution - uMod;
1847
1848 supdrvGipInit(pDevExt, pGip, HCPhysGip, RTTimeSystemNanoTS(), RT_NS_1SEC / u32Interval /*=Hz*/, u32Interval, cCpus);
1849
1850 /*
1851 * Important sanity check...
1852 */
1853 if (RT_UNLIKELY( pGip->enmUseTscDelta == SUPGIPUSETSCDELTA_ZERO_CLAIMED
1854 && pGip->u32Mode == SUPGIPMODE_ASYNC_TSC
1855 && !supdrvOSGetForcedAsyncTscMode(pDevExt)))
1856 {
1857 /* Basically, invariant Windows boxes, should never be detected as async (i.e. TSC-deltas should be 0). */
1858 OSDBGPRINT(("supdrvGipCreate: The TSC-deltas should be normalized by the host OS, but verifying shows it's not!\n"));
1859 return VERR_INTERNAL_ERROR_2;
1860 }
1861
1862 /*
1863 * Do the TSC frequency measurements.
1864 *
1865 * If we're in invariant TSC mode, just to a quick preliminary measurement
1866 * that the TSC-delta measurement code can use to yield cross calls.
1867 *
1868 * If we're in any of the other two modes, neither which require MP init,
1869 * notifications or deltas for the job, do the full measurement now so
1870 * that supdrvGipInitOnCpu can populate the TSC interval and history
1871 * array with more reasonable values.
1872 */
1873 if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1874 {
1875 rc = supdrvGipInitMeasureTscFreq(pDevExt, pGip, true /*fRough*/); /* cannot fail */
1876 supdrvGipInitStartTimerForRefiningInvariantTscFreq(pDevExt, pGip);
1877 }
1878 else
1879 rc = supdrvGipInitMeasureTscFreq(pDevExt, pGip, false /*fRough*/);
1880 if (RT_SUCCESS(rc))
1881 {
1882 /*
1883 * Start TSC-delta measurement thread before we start getting MP
1884 * events that will try kick it into action (includes the
1885 * RTMpOnAll/supdrvGipInitOnCpu call below).
1886 */
1887 RTCpuSetEmpty(&pDevExt->TscDeltaCpuSet);
1888 RTCpuSetEmpty(&pDevExt->TscDeltaObtainedCpuSet);
1889#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1890 if ( pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
1891 && pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
1892 rc = supdrvTscDeltaThreadInit(pDevExt);
1893#endif
1894 if (RT_SUCCESS(rc))
1895 {
1896 rc = RTMpNotificationRegister(supdrvGipMpEvent, pDevExt);
1897 if (RT_SUCCESS(rc))
1898 {
1899 /*
1900 * Do GIP initialization on all online CPUs. Wake up the
1901 * TSC-delta thread afterwards.
1902 */
1903 rc = RTMpOnAll(supdrvGipInitOnCpu, pDevExt, pGip);
1904 if (RT_SUCCESS(rc))
1905 {
1906#ifdef SUPDRV_USE_TSC_DELTA_THREAD
1907 if (pDevExt->hTscDeltaThread != NIL_RTTHREAD)
1908 RTThreadUserSignal(pDevExt->hTscDeltaThread);
1909#else
1910 uint16_t iCpu;
1911 if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
1912 {
1913 /*
1914 * Measure the TSC deltas now that we have MP notifications.
1915 */
1916 int cTries = 5;
1917 do
1918 {
1919 rc = supdrvMeasureInitialTscDeltas(pDevExt);
1920 if ( rc != VERR_TRY_AGAIN
1921 && rc != VERR_CPU_OFFLINE)
1922 break;
1923 } while (--cTries > 0);
1924 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
1925 Log(("supdrvTscDeltaInit: cpu[%u] delta %lld\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta));
1926 }
1927 else
1928 {
1929 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
1930 AssertMsg(!pGip->aCPUs[iCpu].i64TSCDelta, ("iCpu=%u %lld mode=%d\n", iCpu, pGip->aCPUs[iCpu].i64TSCDelta, pGip->u32Mode));
1931 }
1932 if (RT_SUCCESS(rc))
1933#endif
1934 {
1935 /*
1936 * Create the timer.
1937 * If CPU_ALL isn't supported we'll have to fall back to synchronous mode.
1938 */
1939 if (pGip->u32Mode == SUPGIPMODE_ASYNC_TSC)
1940 {
1941 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, RTTIMER_FLAGS_CPU_ALL,
1942 supdrvGipAsyncTimer, pDevExt);
1943 if (rc == VERR_NOT_SUPPORTED)
1944 {
1945 OSDBGPRINT(("supdrvGipCreate: omni timer not supported, falling back to synchronous mode\n"));
1946 pGip->u32Mode = SUPGIPMODE_SYNC_TSC;
1947 }
1948 }
1949 if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
1950 rc = RTTimerCreateEx(&pDevExt->pGipTimer, u32Interval, 0 /* fFlags */,
1951 supdrvGipSyncAndInvariantTimer, pDevExt);
1952 if (RT_SUCCESS(rc))
1953 {
1954 /*
1955 * We're good.
1956 */
1957 Log(("supdrvGipCreate: %u ns interval.\n", u32Interval));
1958 supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
1959
1960 g_pSUPGlobalInfoPage = pGip;
1961 return VINF_SUCCESS;
1962 }
1963
1964 OSDBGPRINT(("supdrvGipCreate: failed create GIP timer at %u ns interval. rc=%Rrc\n", u32Interval, rc));
1965 Assert(!pDevExt->pGipTimer);
1966 }
1967 }
1968 else
1969 OSDBGPRINT(("supdrvGipCreate: RTMpOnAll failed. rc=%Rrc\n", rc));
1970 }
1971 else
1972 OSDBGPRINT(("supdrvGipCreate: failed to register MP event notfication. rc=%Rrc\n", rc));
1973 }
1974 else
1975 OSDBGPRINT(("supdrvGipCreate: supdrvTscDeltaInit failed. rc=%Rrc\n", rc));
1976 }
1977 else
1978 OSDBGPRINT(("supdrvGipCreate: supdrvMeasureInitialTscDeltas failed. rc=%Rrc\n", rc));
1979
1980 /* Releases timer frequency increase too. */
1981 supdrvGipDestroy(pDevExt);
1982 return rc;
1983}
1984
1985
1986/**
1987 * Invalidates the GIP data upon termination.
1988 *
1989 * @param pGip Pointer to the read-write kernel mapping of the GIP.
1990 */
1991static void supdrvGipTerm(PSUPGLOBALINFOPAGE pGip)
1992{
1993 unsigned i;
1994 pGip->u32Magic = 0;
1995 for (i = 0; i < pGip->cCpus; i++)
1996 {
1997 pGip->aCPUs[i].u64NanoTS = 0;
1998 pGip->aCPUs[i].u64TSC = 0;
1999 pGip->aCPUs[i].iTSCHistoryHead = 0;
2000 pGip->aCPUs[i].u64TSCSample = 0;
2001 pGip->aCPUs[i].i64TSCDelta = INT64_MAX;
2002 }
2003}
2004
2005
/**
 * Terminates the GIP.
 *
 * Tears down in dependency order: MP notifications first (so no new events
 * arrive), then the TSC-delta thread (if configured), the timers, the sync
 * buffer, the GIP data itself, and finally the backing memory object.  Each
 * step checks for NULL/NIL, so this is safe to call on partially constructed
 * state (e.g. from the supdrvGipCreate failure path).
 *
 * @param pDevExt   Instance data. GIP stuff may be updated.
 */
void VBOXCALL supdrvGipDestroy(PSUPDRVDEVEXT pDevExt)
{
    int rc;
#ifdef DEBUG_DARWIN_GIP
    OSDBGPRINT(("supdrvGipDestroy: pDevExt=%p pGip=%p pGipTimer=%p GipMemObj=%p\n", pDevExt,
                pDevExt->GipMemObj != NIL_RTR0MEMOBJ ? RTR0MemObjAddress(pDevExt->GipMemObj) : NULL,
                pDevExt->pGipTimer, pDevExt->GipMemObj));
#endif

    /*
     * Stop receiving MP notifications before tearing anything else down.
     */
    RTMpNotificationDeregister(supdrvGipMpEvent, pDevExt);

#ifdef SUPDRV_USE_TSC_DELTA_THREAD
    /*
     * Terminate the TSC-delta measurement thread and resources.
     */
    supdrvTscDeltaTerm(pDevExt);
#endif

    /*
     * Destroy the TSC-refinement timer.
     */
    if (pDevExt->pInvarTscRefineTimer)
    {
        RTTimerDestroy(pDevExt->pInvarTscRefineTimer);
        pDevExt->pInvarTscRefineTimer = NULL;
    }

    /* Free the TSC-delta sync buffer; both pointer aliases into it are cleared. */
    if (pDevExt->pvTscDeltaSync)
    {
        RTMemFree(pDevExt->pvTscDeltaSync);
        pDevExt->pTscDeltaSync = NULL;
        pDevExt->pvTscDeltaSync = NULL;
    }

    /*
     * Invalidate the GIP data (supdrvGipTerm wipes the magic and per-CPU data).
     */
    if (pDevExt->pGip)
    {
        supdrvGipTerm(pDevExt->pGip);
        pDevExt->pGip = NULL;
    }
    g_pSUPGlobalInfoPage = NULL;

    /*
     * Destroy the timer and free the GIP memory object.
     */
    if (pDevExt->pGipTimer)
    {
        rc = RTTimerDestroy(pDevExt->pGipTimer); AssertRC(rc);
        pDevExt->pGipTimer = NULL;
    }

    if (pDevExt->GipMemObj != NIL_RTR0MEMOBJ)
    {
        rc = RTR0MemObjFree(pDevExt->GipMemObj, true /* free mappings */); AssertRC(rc);
        pDevExt->GipMemObj = NIL_RTR0MEMOBJ;
    }

    /*
     * Finally, make sure we've released the system timer resolution request
     * if one actually succeeded and is still pending.
     */
    supdrvGipReleaseHigherTimerFrequencyFromSystem(pDevExt);
}
2079
2080
2081
2082
2083/*
2084 *
2085 *
2086 * GIP Update Timer Related Code
2087 * GIP Update Timer Related Code
2088 * GIP Update Timer Related Code
2089 *
2090 *
2091 */
2092
2093
/**
 * Worker routine for supdrvGipUpdate() and supdrvGipUpdatePerCpu() that
 * updates all the per cpu data except the transaction id.
 *
 * Publishes the new NanoTS/TSC values, and for the non-invariant modes also
 * maintains the 8-entry TSC interval history and re-derives the per-CPU
 * frequency (u64CpuHz) from it.
 *
 * @param   pDevExt         The device extension.
 * @param   pGipCpu         Pointer to the per cpu data.
 * @param   u64NanoTS       The current time stamp.
 * @param   u64TSC          The current TSC.
 * @param   iTick           The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipDoUpdateCpu(PSUPDRVDEVEXT pDevExt, PSUPGIPCPU pGipCpu, uint64_t u64NanoTS, uint64_t u64TSC, uint64_t iTick)
{
    uint64_t u64TSCDelta;
    uint32_t u32UpdateIntervalTSC;
    uint32_t u32UpdateIntervalTSCSlack;
    unsigned iTSCHistoryHead;
    uint64_t u64CpuHz;
    uint32_t u32TransactionId;

    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    AssertPtrReturnVoid(pGip);

    /* Delta between this and the previous update. */
    ASMAtomicUoWriteU32(&pGipCpu->u32PrevUpdateIntervalNS, (uint32_t)(u64NanoTS - pGipCpu->u64NanoTS));

    /*
     * Update the NanoTS.
     */
    ASMAtomicWriteU64(&pGipCpu->u64NanoTS, u64NanoTS);

    /*
     * Calc TSC delta.
     */
    u64TSCDelta = u64TSC - pGipCpu->u64TSC;
    ASMAtomicWriteU64(&pGipCpu->u64TSC, u64TSC);

    /* We don't need to keep recalculating the frequency when it's invariant,
       so all the history/frequency work below is skipped in that mode. */
    if (pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC)
        return;

    /* A delta that doesn't fit in 32 bits is bogus (missed ticks, host
       sleep, ...); substitute the previous interval and count an error. */
    if (u64TSCDelta >> 32)
    {
        u64TSCDelta = pGipCpu->u32UpdateIntervalTSC;
        pGipCpu->cErrors++;
    }

    /*
     * On the 2nd and 3rd callout, reset the history with the current TSC
     * interval since the values entered by supdrvGipInit are totally off.
     * The interval on the 1st callout completely unreliable, the 2nd is a bit
     * better, while the 3rd should be most reliable.
     */
    u32TransactionId = pGipCpu->u32TransactionId;
    if (RT_UNLIKELY(   (   u32TransactionId == 5
                        || u32TransactionId == 7)
                    && (   iTick == 2
                        || iTick == 3) ))
    {
        unsigned i;
        for (i = 0; i < RT_ELEMENTS(pGipCpu->au32TSCHistory); i++)
            ASMAtomicUoWriteU32(&pGipCpu->au32TSCHistory[i], (uint32_t)u64TSCDelta);
    }

    /*
     * Validate the NanoTS deltas between timer fires with an arbitrary threshold of 0.5%.
     * Wait until we have at least one full history since the above history reset. The
     * assumption is that the majority of the previous history values will be tolerable.
     * See @bugref{6710} comment #67.
     *
     * If the nano interval is out of tolerance, this tick's measured TSC delta
     * is replaced with the average of the full 8-entry history so one bad
     * sample doesn't pollute the history written below.
     */
    if (   u32TransactionId > 23 /* 7 + (8 * 2) */
        && pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
    {
        uint32_t uNanoTsThreshold = pGip->u32UpdateIntervalNS / 200;
        if (   pGipCpu->u32PrevUpdateIntervalNS > pGip->u32UpdateIntervalNS + uNanoTsThreshold
            || pGipCpu->u32PrevUpdateIntervalNS < pGip->u32UpdateIntervalNS - uNanoTsThreshold)
        {
            /* Average entries 0..3 and 4..7 separately, then average the two halves. */
            uint32_t u32;
            u32 = pGipCpu->au32TSCHistory[0];
            u32 += pGipCpu->au32TSCHistory[1];
            u32 += pGipCpu->au32TSCHistory[2];
            u32 += pGipCpu->au32TSCHistory[3];
            u32 >>= 2;
            u64TSCDelta = pGipCpu->au32TSCHistory[4];
            u64TSCDelta += pGipCpu->au32TSCHistory[5];
            u64TSCDelta += pGipCpu->au32TSCHistory[6];
            u64TSCDelta += pGipCpu->au32TSCHistory[7];
            u64TSCDelta >>= 2;
            u64TSCDelta += u32;
            u64TSCDelta >>= 1;
        }
    }

    /*
     * TSC History.
     */
    Assert(RT_ELEMENTS(pGipCpu->au32TSCHistory) == 8);
    iTSCHistoryHead = (pGipCpu->iTSCHistoryHead + 1) & 7;
    ASMAtomicWriteU32(&pGipCpu->iTSCHistoryHead, iTSCHistoryHead);
    ASMAtomicWriteU32(&pGipCpu->au32TSCHistory[iTSCHistoryHead], (uint32_t)u64TSCDelta);

    /*
     * UpdateIntervalTSC = average of last 8,2,1 intervals depending on update HZ.
     *
     * On Windows, we have an occasional (but recurring) sour value that messed up
     * the history but taking only 1 interval reduces the precision overall.
     * However, this problem existed before the invariant mode was introduced.
     *
     * NOTE(review): the SUPGIPMODE_INVARIANT_TSC half of this condition looks
     * unreachable because of the early return above -- confirm before cleaning up.
     */
    if (   pGip->u32Mode == SUPGIPMODE_INVARIANT_TSC
        || pGip->u32UpdateHz >= 1000)
    {
        uint32_t u32;
        u32 = pGipCpu->au32TSCHistory[0];
        u32 += pGipCpu->au32TSCHistory[1];
        u32 += pGipCpu->au32TSCHistory[2];
        u32 += pGipCpu->au32TSCHistory[3];
        u32 >>= 2;
        u32UpdateIntervalTSC = pGipCpu->au32TSCHistory[4];
        u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[5];
        u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[6];
        u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[7];
        u32UpdateIntervalTSC >>= 2;
        u32UpdateIntervalTSC += u32;
        u32UpdateIntervalTSC >>= 1;

        /* Value chosen for a 2GHz Athlon64 running linux 2.6.10/11. */
        u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 14;
    }
    else if (pGip->u32UpdateHz >= 90)
    {
        /* Average of the current and the previous interval only. */
        u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;
        u32UpdateIntervalTSC += pGipCpu->au32TSCHistory[(iTSCHistoryHead - 1) & 7];
        u32UpdateIntervalTSC >>= 1;

        /* value chosen on a 2GHz thinkpad running windows */
        u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 7;
    }
    else
    {
        /* Low update rate: use the current interval as-is. */
        u32UpdateIntervalTSC = (uint32_t)u64TSCDelta;

        /* This value hasn't been checked yet.. waiting for OS/2 and 33Hz timers.. :-) */
        u32UpdateIntervalTSCSlack = u32UpdateIntervalTSC >> 6;
    }
    ASMAtomicWriteU32(&pGipCpu->u32UpdateIntervalTSC, u32UpdateIntervalTSC + u32UpdateIntervalTSCSlack);

    /*
     * CpuHz: ticks-per-interval scaled to ticks-per-second.
     */
    u64CpuHz = ASMMult2xU32RetU64(u32UpdateIntervalTSC, RT_NS_1SEC);
    u64CpuHz /= pGip->u32UpdateIntervalNS;
    ASMAtomicWriteU64(&pGipCpu->u64CpuHz, u64CpuHz);
}
2248
2249
/**
 * Updates the GIP.
 *
 * Picks the per-CPU entry (entry 0 for sync/invariant modes, APIC-id lookup
 * for async mode), brackets the update in the odd/even transaction-id
 * protocol readers rely on, and periodically re-derives the actual update
 * frequency on non-invariant hosts.
 *
 * @param   pDevExt         The device extension.
 * @param   u64NanoTS       The current nanosecond timestamp.
 * @param   u64TSC          The current TSC timestamp.
 * @param   idCpu           The CPU ID.
 * @param   iTick           The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipUpdate(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC, RTCPUID idCpu, uint64_t iTick)
{
    /*
     * Determine the relevant CPU data.
     */
    PSUPGIPCPU pGipCpu;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
    AssertPtrReturnVoid(pGip);

    if (pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
        pGipCpu = &pGip->aCPUs[0];
    else
    {
        /* Async mode: map the current APIC id to a GIP CPU entry; bail out
           quietly if the mapping is stale or out of range. */
        unsigned iCpu = pGip->aiCpuFromApicId[ASMGetApicId()];
        if (RT_UNLIKELY(iCpu >= pGip->cCpus))
            return;
        pGipCpu = &pGip->aCPUs[iCpu];
        if (RT_UNLIKELY(pGipCpu->idCpu != idCpu))
            return;
    }

    /*
     * Start update transaction.  The transaction id must be odd while an
     * update is in progress; readers retry when they see an odd value.
     */
    if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
    {
        /* this can happen on win32 if we're taking too long and there are more CPUs around. shouldn't happen though. */
        AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
        ASMAtomicIncU32(&pGipCpu->u32TransactionId);
        pGipCpu->cErrors++;
        return;
    }

    /*
     * Recalc the update frequency every 0x800th time.
     */
    if (   pGip->u32Mode != SUPGIPMODE_INVARIANT_TSC /* cuz we're not recalculating the frequency on invariant hosts. */
        && !(pGipCpu->u32TransactionId & (GIP_UPDATEHZ_RECALC_FREQ * 2 - 2)))
    {
        if (pGip->u64NanoTSLastUpdateHz)
        {
#ifdef RT_ARCH_AMD64 /** @todo fix 64-bit div here to work on x86 linux. */
            uint64_t u64Delta = u64NanoTS - pGip->u64NanoTSLastUpdateHz;
            uint32_t u32UpdateHz = (uint32_t)((RT_NS_1SEC_64 * GIP_UPDATEHZ_RECALC_FREQ) / u64Delta);
            /* Only accept plausible rates (30..2000 Hz); ignore outliers. */
            if (u32UpdateHz <= 2000 && u32UpdateHz >= 30)
            {
                /** @todo r=ramshankar: Changing u32UpdateHz might screw up TSC frequency
                 *        calculation on non-invariant hosts if it changes the history decision
                 *        taken in supdrvGipDoUpdateCpu(). */
                uint64_t u64Interval = u64Delta / GIP_UPDATEHZ_RECALC_FREQ;
                ASMAtomicWriteU32(&pGip->u32UpdateHz, u32UpdateHz);
                ASMAtomicWriteU32(&pGip->u32UpdateIntervalNS, (uint32_t)u64Interval);
            }
#endif
        }
        /* The |1 keeps the timestamp non-zero so the branch above is taken next time. */
        ASMAtomicWriteU64(&pGip->u64NanoTSLastUpdateHz, u64NanoTS | 1);
    }

    /*
     * Update the data.
     */
    supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);

    /*
     * Complete transaction (back to an even transaction id).
     */
    ASMAtomicIncU32(&pGipCpu->u32TransactionId);
}
2329
2330
/**
 * Updates the per cpu GIP data for the calling cpu.
 *
 * Used in async TSC mode for CPUs other than the GIP master.  Resolves the
 * GIP CPU entry via the APIC id, then performs the same odd/even
 * transaction-bracketed update as supdrvGipUpdate().
 *
 * @param   pDevExt         The device extension.
 * @param   u64NanoTS       The current nanosecond timestamp.
 * @param   u64TSC          The current TSC timestamp.
 * @param   idCpu           The CPU ID.
 * @param   idApic          The APIC id for the CPU index.
 * @param   iTick           The current timer tick.
 *
 * @remarks Can be called with interrupts disabled!
 */
static void supdrvGipUpdatePerCpu(PSUPDRVDEVEXT pDevExt, uint64_t u64NanoTS, uint64_t u64TSC,
                                  RTCPUID idCpu, uint8_t idApic, uint64_t iTick)
{
    uint32_t iCpu;
    PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;

    /*
     * Avoid a potential race when a CPU online notification doesn't fire on
     * the onlined CPU but the tick creeps in before the event notification is
     * run.
     */
    if (RT_UNLIKELY(iTick == 1))
    {
        iCpu = supdrvGipFindOrAllocCpuIndexForCpuId(pGip, idCpu);
        if (pGip->aCPUs[iCpu].enmState == SUPGIPCPUSTATE_OFFLINE)
            supdrvGipMpEventOnlineOrInitOnCpu(pDevExt, idCpu);
    }

    /* Silently skip the update if the APIC id doesn't map to a valid,
       matching CPU entry (e.g. mapping not established yet). */
    iCpu = pGip->aiCpuFromApicId[idApic];
    if (RT_LIKELY(iCpu < pGip->cCpus))
    {
        PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
        if (pGipCpu->idCpu == idCpu)
        {
            /*
             * Start update transaction (id must become odd; even here means a
             * previous update never completed).
             */
            if (!(ASMAtomicIncU32(&pGipCpu->u32TransactionId) & 1))
            {
                AssertMsgFailed(("Invalid transaction id, %#x, not odd!\n", pGipCpu->u32TransactionId));
                ASMAtomicIncU32(&pGipCpu->u32TransactionId);
                pGipCpu->cErrors++;
                return;
            }

            /*
             * Update the data.
             */
            supdrvGipDoUpdateCpu(pDevExt, pGipCpu, u64NanoTS, u64TSC, iTick);

            /*
             * Complete transaction.
             */
            ASMAtomicIncU32(&pGipCpu->u32TransactionId);
        }
    }
}
2390
2391
/**
 * Timer callback function for the sync and invariant GIP modes.
 *
 * Samples TSC and system time with interrupts disabled, applies the per-CPU
 * TSC delta when deltas are in use, and hands the sample to supdrvGipUpdate().
 *
 * @param   pTimer      The timer.
 * @param   pvUser      Opaque pointer to the device extension.
 * @param   iTick       The timer tick.
 */
static DECLCALLBACK(void) supdrvGipSyncAndInvariantTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
{
    RTCCUINTREG        uFlags;
    uint64_t           u64TSC;
    uint64_t           u64NanoTS;
    PSUPDRVDEVEXT      pDevExt   = (PSUPDRVDEVEXT)pvUser;
    PSUPGLOBALINFOPAGE pGip      = pDevExt->pGip;

    uFlags    = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
    u64TSC    = ASMReadTSC();
    u64NanoTS = RTTimeSystemNanoTS();

    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_PRACTICALLY_ZERO)
    {
        /*
         * The calculations in supdrvGipUpdate() is very timing sensitive and doesn't handle
         * missed timer ticks. So for now it is better to use a delta of 0 and have the TSC rate
         * affected a bit until we get proper TSC deltas than implementing options like
         * rescheduling the tick to be delivered on the right CPU or missing the tick entirely.
         *
         * The likelihood of this happening is really low. On Windows, Linux, and Solaris
         * timers fire on the CPU they were registered/started on. Darwin timers doesn't
         * necessarily (they are high priority threads waiting).
         */
        Assert(!ASMIntAreEnabled());
        supdrvTscDeltaApply(pGip, &u64TSC, ASMGetApicId(), NULL /* pfDeltaApplied */);
    }

    supdrvGipUpdate(pDevExt, u64NanoTS, u64TSC, NIL_RTCPUID, iTick);

    ASMSetFlags(uFlags);

#ifdef SUPDRV_USE_TSC_DELTA_THREAD
    /* If any CPUs are waiting for their TSC delta to be measured, nudge the
       delta thread's state machine towards measuring. */
    if (   pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED
        && !RTCpuSetIsEmpty(&pDevExt->TscDeltaCpuSet))
    {
        RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
        if (   pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
            || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
            pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
        RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
        /** @todo Do the actual poking using -- RTThreadUserSignal() */
    }
#endif
}
2444
2445
2446/**
2447 * Timer callback function for async GIP mode.
2448 * @param pTimer The timer.
2449 * @param pvUser Opaque pointer to the device extension.
2450 * @param iTick The timer tick.
2451 */
2452static DECLCALLBACK(void) supdrvGipAsyncTimer(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
2453{
2454 RTCCUINTREG fOldFlags = ASMIntDisableFlags(); /* No interruptions please (real problem on S10). */
2455 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
2456 RTCPUID idCpu = RTMpCpuId();
2457 uint64_t u64TSC = ASMReadTSC();
2458 uint64_t NanoTS = RTTimeSystemNanoTS();
2459
2460 /** @todo reset the transaction number and whatnot when iTick == 1. */
2461 if (pDevExt->idGipMaster == idCpu)
2462 supdrvGipUpdate(pDevExt, NanoTS, u64TSC, idCpu, iTick);
2463 else
2464 supdrvGipUpdatePerCpu(pDevExt, NanoTS, u64TSC, idCpu, ASMGetApicId(), iTick);
2465
2466 ASMSetFlags(fOldFlags);
2467}
2468
2469
2470
2471
2472/*
2473 *
2474 *
2475 * TSC Delta Measurements And Related Code
2476 * TSC Delta Measurements And Related Code
2477 * TSC Delta Measurements And Related Code
2478 *
2479 *
2480 */
2481
2482
/*
 * Select TSC delta measurement algorithm.  Method 1 is the default; method 2
 * is experimental (see the GIP_TSC_DELTA_METHOD_2 section below).
 */
#if 1
# define GIP_TSC_DELTA_METHOD_1
#else
# define GIP_TSC_DELTA_METHOD_2
#endif

/** For padding variables to keep them away from other cache lines.  Better too
 * large than too small!
 * @remarks Current AMD64 and x86 CPUs seems to use 64 bytes.  There are claims
 *          that NetBurst had 128 byte cache lines while the 486 thru Pentium
 *          III had 32 bytes cache lines. */
#define GIP_TSC_DELTA_CACHE_LINE_SIZE           128
2498
2499
/**
 * TSC delta measurement algorithm \#2 result entry.
 */
typedef struct SUPDRVTSCDELTAMETHOD2ENTRY
{
    /** This CPU's sequence number when the sample was taken. */
    uint32_t iSeqMine;
    /** The other CPU's sequence number observed at the same time.  The low
     *  bit is tested by supdrvTscDeltaMethod2ProcessDataOnMaster before the
     *  entry is used. */
    uint32_t iSeqOther;
    /** The TSC value read on this CPU. */
    uint64_t uTsc;
} SUPDRVTSCDELTAMETHOD2ENTRY;

/**
 * TSC delta measurement algorithm \#2 Data.
 */
typedef struct SUPDRVTSCDELTAMETHOD2
{
    /** Padding to make sure the iCurSeqNo is in its own cache line. */
    uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 1];
    /** The current sequence number of this worker. */
    uint32_t volatile iCurSeqNo;
    /** Padding to make sure the iCurSeqNo is in its own cache line.
     * (NOTE(review): member is uint32_t despite the 'au64' prefix -- the size
     * arithmetic is consistent with uint32_t, only the name is off.) */
    uint32_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint32_t) - 1];
    /** Result table. */
    SUPDRVTSCDELTAMETHOD2ENTRY aResults[96];
} SUPDRVTSCDELTAMETHOD2;
/** Pointer to the data for TSC delta measurement algorithm \#2 .*/
typedef SUPDRVTSCDELTAMETHOD2 *PSUPDRVTSCDELTAMETHOD2;
2526
2527
/**
 * The TSC delta synchronization struct, version 2.
 *
 * The synchronization variable is completely isolated in its own cache line
 * (provided our max cache line size estimate is correct).
 */
typedef struct SUPTSCDELTASYNC2
{
    /** Padding to make sure the uVar1 is in its own cache line. */
    uint64_t au64CacheLinePaddingBefore[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t)];
    /** The synchronization variable, holds values GIP_TSC_DELTA_SYNC_*. */
    volatile uint32_t uVar1;
    /** Unused. */
    volatile uint32_t uVar2;

    /** Start RDTSC value.  This does not need to be in its own cache line, it's
     * just put here to save stack space. */
    uint64_t uTscStart;
    /** Max number of ticks we can allow to elapse in the RTMpOn callback.
     * This is estimated from the CPU frequency... */
    uint64_t cMaxTicks;

    /** Padding to make sure the uVar1 is in its own cache line. */
    uint64_t au64CacheLinePaddingAfter[GIP_TSC_DELTA_CACHE_LINE_SIZE / sizeof(uint64_t) - 2];
} SUPTSCDELTASYNC2;
/* Two full cache lines plus the uTscStart/cMaxTicks overflow (one uint64_t). */
AssertCompileSize(SUPTSCDELTASYNC2, GIP_TSC_DELTA_CACHE_LINE_SIZE * 2 + sizeof(uint64_t));
/** Pointer to a version 2 TSC delta synchronization struct. */
typedef SUPTSCDELTASYNC2 *PSUPTSCDELTASYNC2;
2555
2556
/**
 * Argument package/state passed by supdrvMeasureTscDeltaOne to the RTMpOn
 * callback worker.
 */
typedef struct SUPDRVGIPTSCDELTARGS
{
    /** The device extension. */
    PSUPDRVDEVEXT pDevExt;
    /** Pointer to the GIP CPU array entry for the worker. */
    PSUPGIPCPU pWorker;
    /** Pointer to the GIP CPU array entry for the master. */
    PSUPGIPCPU pMaster;
    /** Pointer to the master's synchronization struct (on stack). */
    PSUPTSCDELTASYNC2 pSyncMaster;
    /** Pointer to the worker's synchronization struct (on stack). */
    PSUPTSCDELTASYNC2 pSyncWorker;

#if 0
    /** Method 1 data. */
    struct
    {
    } M1;
#endif

#ifdef GIP_TSC_DELTA_METHOD_2
    /** Method 2 data. */
    struct
    {
        /** The master's sample/result table. */
        PSUPDRVTSCDELTAMETHOD2 pMasterData;
        /** The worker's sample/result table. */
        PSUPDRVTSCDELTAMETHOD2 pWorkerData;
        /** Running total of matched samples across all measurement loops. */
        uint32_t cHits;
        /** Whether the master runs with the tiny execution lag (see
         *  supdrvTscDeltaMethod2CollectData's @a fLag). */
        bool fLagMaster;
        /** Whether the worker runs with the tiny execution lag. */
        bool fLagWorker;
        /** Set by the master when the result is good enough to stop early. */
        bool volatile fQuitEarly;
    } M2;
#endif
} SUPDRVGIPTSCDELTARGS;
typedef SUPDRVGIPTSCDELTARGS *PSUPDRVGIPTSCDELTARGS;
2594
2595
/** @name Macros that implements the basic synchronization steps common to
 *        the algorithms.
 *
 *  These operate on the legacy sync struct's `u` flag and assume a local
 *  `uFlags` variable in the master's scope for the saved interrupt state.
 * @{
 */
/** Master: signal the start of a round, disable interrupts, and spin until the
 *  worker moves the flag out of the START state. */
#define TSCDELTA_MASTER_SYNC_BEFORE(a_pTscDeltaSync) \
    do {\
        ASMAtomicWriteU32(&(a_pTscDeltaSync)->u, GIP_TSC_DELTA_SYNC_START); \
        \
        /* Disable interrupts only in the master for as short a period \
           as possible, thanks again to Windows. See @bugref{6710} comment #73. */ \
        uFlags = ASMIntDisableFlags(); \
        \
        while (ASMAtomicReadU32(&(a_pTscDeltaSync)->u) == GIP_TSC_DELTA_SYNC_START) \
        { /* nothing */ } \
    } while (0)
/** Master: re-enable interrupts and wait until the worker reports its data
 *  collection done. */
#define TSCDELTA_MASTER_SYNC_AFTER(a_pTscDeltaSync) \
    do {\
        /* Sync up with worker. */ \
        ASMSetFlags(uFlags); \
        \
        while (ASMAtomicReadU32(&(a_pTscDeltaSync)->u) != GIP_TSC_DELTA_SYNC_WORKER_DONE) \
        { /* nothing */ } \
    } while (0)
/** Master: release the worker from its post-round spin (see
 *  TSCDELTA_OTHER_SYNC_AFTER). */
#define TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(a_pTscDeltaSync) \
    do {\
        ASMAtomicWriteU32(&(a_pTscDeltaSync)->u, GIP_TSC_DELTA_SYNC_STOP); \
    } while (0)

/** Worker: wait for the master's START signal, evaluate @a a_MidSyncExpr,
 *  then acknowledge readiness. */
#define TSCDELTA_OTHER_SYNC_BEFORE(a_pTscDeltaSync, a_MidSyncExpr) \
    do { \
        while (ASMAtomicReadU32(&(a_pTscDeltaSync)->u) != GIP_TSC_DELTA_SYNC_START) \
        { /* nothing */ } \
        a_MidSyncExpr; \
        ASMAtomicWriteU32(&(a_pTscDeltaSync)->u, GIP_TSC_DELTA_SYNC_WORKER_READY); \
    } while (0)
/** Worker: report data collection done and spin until the master has
 *  processed it (flag leaves WORKER_DONE). */
#define TSCDELTA_OTHER_SYNC_AFTER(a_pTscDeltaSync) \
    do { \
        /* Tell master we're done collecting our data. */ \
        ASMAtomicWriteU32(&(a_pTscDeltaSync)->u, GIP_TSC_DELTA_SYNC_WORKER_DONE); \
        \
        /* Wait for the master to process the data. */ \
        while (ASMAtomicReadU32(&(a_pTscDeltaSync)->u) == GIP_TSC_DELTA_SYNC_WORKER_DONE) \
            ASMNopPause(); \
    } while (0)
/** @} */
2641
2642#ifdef GIP_TSC_DELTA_METHOD_1
2643
/**
 * TSC delta measurement algorithm \#1 (GIP_TSC_DELTA_METHOD_1).
 *
 *
 * We ignore the first few runs of the loop in order to prime the
 * cache. Also, we need to be careful about using 'pause' instruction
 * in critical busy-wait loops in this code - it can cause undesired
 * behaviour with hyperthreading.
 *
 * We try to minimize the measurement error by computing the minimum
 * read time of the compare statement in the worker by taking TSC
 * measurements across it.
 *
 * It must be noted that the computed minimum read time is mostly to
 * eliminate huge deltas when the worker is too early and doesn't by
 * itself help produce more accurate deltas. We allow two times the
 * computed minimum as an arbitrary acceptable threshold. Therefore,
 * it is still possible to get negative deltas where there are none
 * when the worker is earlier. As long as these occasional negative
 * deltas are lower than the time it takes to exit guest-context and
 * the OS to reschedule EMT on a different CPU we won't expose a TSC
 * that jumped backwards. It is because of the existence of the
 * negative deltas we don't recompute the delta with the master and
 * worker interchanged to eliminate the remaining measurement error.
 *
 *
 * @param   pArgs               The argument/state data.
 * @param   pSync               The synchronization structure
 *                              (pDevExt->pTscDeltaSync).
 * @param   fIsMaster           Set if master, clear if worker.
 * @param   iTry                The attempt number.
 */
static void supdrvTscDeltaMethod1Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC pSync, bool fIsMaster, uint32_t iTry)
{
    PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
    PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
    uint64_t uMinCmpReadTime = UINT64_MAX;
    unsigned iLoop;
    NOREF(iTry);

    for (iLoop = 0; iLoop < GIP_TSC_DELTA_LOOPS; iLoop++)
    {
        if (fIsMaster)
        {
            /*
             * The master.
             */
            RTCCUINTREG uFlags;
            AssertMsg(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD,
                      ("%#llx idMaster=%#x idWorker=%#x (idGipMaster=%#x)\n",
                       pGipCpuMaster->u64TSCSample, pGipCpuMaster->idCpu, pGipCpuWorker->idCpu, pArgs->pDevExt->idGipMaster));
            TSCDELTA_MASTER_SYNC_BEFORE(pSync);

            /* Publish our TSC reading; loop until the published value leaves
               the reserved sentinel (the TSC itself could theoretically hit it). */
            do
            {
                ASMSerializeInstruction();
                ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, ASMReadTSC());
            } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);

            TSCDELTA_MASTER_SYNC_AFTER(pSync);

            /* Process the data (skipped for the priming/read-time loops). */
            if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
            {
                if (pGipCpuWorker->u64TSCSample != GIP_TSC_DELTA_RSVD)
                {
                    /* Keep the delta closest to GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                       across all loops; INT64_MAX means no delta recorded yet. */
                    int64_t iDelta = pGipCpuWorker->u64TSCSample
                                   - (pGipCpuMaster->u64TSCSample - pGipCpuMaster->i64TSCDelta);
                    if (  iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                        ? iDelta < pGipCpuWorker->i64TSCDelta
                        : iDelta > pGipCpuWorker->i64TSCDelta || pGipCpuWorker->i64TSCDelta == INT64_MAX)
                        pGipCpuWorker->i64TSCDelta = iDelta;
                }
            }

            /* Reset our TSC sample and tell the worker to move on. */
            ASMAtomicWriteU64(&pGipCpuMaster->u64TSCSample, GIP_TSC_DELTA_RSVD);
            TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pSync);
        }
        else
        {
            /*
             * The worker.
             */
            uint64_t uTscWorker;
            uint64_t uTscWorkerFlushed;
            uint64_t uCmpReadTime;

            ASMAtomicReadU64(&pGipCpuMaster->u64TSCSample); /* Warm the cache line. */
            TSCDELTA_OTHER_SYNC_BEFORE(pSync, Assert(pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD));

            /*
             * Keep reading the TSC until we notice that the master has read his. Reading
             * the TSC -after- the master has updated the memory is way too late. We thus
             * compensate by trying to measure how long it took for the worker to notice
             * the memory flushed from the master.
             */
            do
            {
                ASMSerializeInstruction();
                uTscWorker = ASMReadTSC();
            } while (pGipCpuMaster->u64TSCSample == GIP_TSC_DELTA_RSVD);
            ASMSerializeInstruction();
            uTscWorkerFlushed = ASMReadTSC();

            uCmpReadTime = uTscWorkerFlushed - uTscWorker;
            if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS + GIP_TSC_DELTA_READ_TIME_LOOPS)
            {
                /* This is totally arbitrary a.k.a I don't like it but I have no better ideas for now. */
                if (uCmpReadTime < (uMinCmpReadTime << 1))
                {
                    ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, uTscWorker);
                    if (uCmpReadTime < uMinCmpReadTime)
                        uMinCmpReadTime = uCmpReadTime;
                }
                else
                    /* Sample took too long to observe; invalidate it for this loop. */
                    ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
            }
            else if (iLoop > GIP_TSC_DELTA_PRIMER_LOOPS)
            {
                /* Read-time calibration loops: only track the minimum observe time. */
                if (uCmpReadTime < uMinCmpReadTime)
                    uMinCmpReadTime = uCmpReadTime;
            }

            TSCDELTA_OTHER_SYNC_AFTER(pSync);
        }
    }

    /*
     * We must reset the worker TSC sample value in case it gets picked as a
     * GIP master later on (it's trashed above, naturally).
     */
    if (!fIsMaster)
        ASMAtomicWriteU64(&pGipCpuWorker->u64TSCSample, GIP_TSC_DELTA_RSVD);
}
2779
2780
2781/**
2782 * Initializes the argument/state data belonging to algorithm \#1.
2783 *
2784 * @returns VBox status code.
2785 * @param pArgs The argument/state data.
2786 */
2787static int supdrvTscDeltaMethod1Init(PSUPDRVGIPTSCDELTARGS pArgs)
2788{
2789 NOREF(pArgs);
2790 return VINF_SUCCESS;
2791}
2792
2793
2794/**
2795 * Undoes what supdrvTscDeltaMethod1Init() did.
2796 *
2797 * @param pArgs The argument/state data.
2798 */
2799static void supdrvTscDeltaMethod1Delete(PSUPDRVGIPTSCDELTARGS pArgs)
2800{
2801 NOREF(pArgs);
2802}
2803
2804#endif /* GIP_TSC_DELTA_METHOD_1 */
2805
2806
2807#ifdef GIP_TSC_DELTA_METHOD_2
2808/*
2809 * TSC delta measurement algorithm \#2 configuration and code - Experimental!!
2810 */
2811
/** Total number of measurement loops for method \#2 (includes the primer loops). */
# define GIP_TSC_DELTA_M2_LOOPS             (12 + GIP_TSC_DELTA_M2_PRIMER_LOOPS)
/** Number of initial loops used to prime caches before real measurement. */
# define GIP_TSC_DELTA_M2_PRIMER_LOOPS      1
2814
2815
/**
 * Processes the collected sample tables after a method \#2 measurement round.
 *
 * Cross-references the master's and worker's result tables via the recorded
 * sequence numbers, keeps the TSC delta closest to
 * GIP_TSC_DELTA_INITIAL_MASTER_VALUE, and sets M2.fQuitEarly when the result
 * looks good enough to stop measuring.
 *
 * @param   pArgs   The argument/state data.
 * @param   iLoop   The current measurement loop number.
 */
static void supdrvTscDeltaMethod2ProcessDataOnMaster(PSUPDRVGIPTSCDELTARGS pArgs, uint32_t iLoop)
{
    PSUPDRVTSCDELTAMETHOD2  pMasterData = pArgs->M2.pMasterData;
    PSUPDRVTSCDELTAMETHOD2  pOtherData  = pArgs->M2.pWorkerData;
    int64_t                 iMasterTscDelta  = pArgs->pMaster->i64TSCDelta;
    int64_t                 iBestDelta       = pArgs->pWorker->i64TSCDelta;
    uint32_t                idxResult;
    uint32_t                cHits = 0;

    /*
     * Look for matching entries in the master and worker tables.
     */
    for (idxResult = 0; idxResult < RT_ELEMENTS(pMasterData->aResults); idxResult++)
    {
        uint32_t idxOther = pMasterData->aResults[idxResult].iSeqOther;
        if (idxOther & 1)
        {
            /* Low bit set means the recorded value is usable; the remaining
               bits index into the other table. */
            idxOther >>= 1;
            if (idxOther < RT_ELEMENTS(pOtherData->aResults))
            {
                /* A hit requires the cross-references to agree both ways. */
                if (pOtherData->aResults[idxOther].iSeqOther == pMasterData->aResults[idxResult].iSeqMine)
                {
                    int64_t iDelta;
                    iDelta = pOtherData->aResults[idxOther].uTsc
                           - (pMasterData->aResults[idxResult].uTsc - iMasterTscDelta);
                    /* Keep the delta closest to the initial master value
                       (INT64_MAX = no delta recorded yet). */
                    if (  iDelta >= GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                        ? iDelta < iBestDelta
                        : iDelta > iBestDelta || iBestDelta == INT64_MAX)
                        iBestDelta = iDelta;
                    cHits++;
                }
            }
        }
    }

    /*
     * Save the results.
     */
    if (cHits > 2)
        pArgs->pWorker->i64TSCDelta = iBestDelta;
    pArgs->M2.cHits += cHits;

    /*
     * Check and see if we can quit a little early.  If the result is already
     * extremely good (+/-16 ticks seems reasonable), just stop.
     */
    if (  iBestDelta >= 0 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
        ? iBestDelta <= 16 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
        : iBestDelta >= -16 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE)
    {
        /*SUPR0Printf("quitting early #1: hits=%#x iLoop=%d iBestDelta=%lld\n", cHits, iLoop, iBestDelta);*/
        ASMAtomicWriteBool(&pArgs->M2.fQuitEarly, true);
    }
    /*
     * After a while, just stop if we get sufficient hits.
     */
    else if (   iLoop >= GIP_TSC_DELTA_M2_LOOPS / 3
             && cHits > 8)
    {
        uint32_t const cHitsNeeded = GIP_TSC_DELTA_M2_LOOPS * RT_ELEMENTS(pArgs->M2.pMasterData->aResults) / 4; /* 25% */
        if (   pArgs->M2.cHits >= cHitsNeeded
            && (  iBestDelta >= 0 + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                ? iBestDelta <= GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO + GIP_TSC_DELTA_INITIAL_MASTER_VALUE
                : iBestDelta >= -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO + GIP_TSC_DELTA_INITIAL_MASTER_VALUE) )
        {
            /*SUPR0Printf("quitting early hits=%#x (%#x) needed=%#x iLoop=%d iBestDelta=%lld\n",
                          pArgs->M2.cHits, cHits, cHitsNeeded, iLoop, iBestDelta);*/
            ASMAtomicWriteBool(&pArgs->M2.fQuitEarly, true);
        }
    }
}
2887
2888
2889/**
2890 * The core function of the 2nd TSC delta mesurment algorithm.
2891 *
2892 * The idea here is that we have the two CPUs execute the exact same code
2893 * collecting a largish set of TSC samples. The code has one data dependency on
2894 * the other CPU which intention it is to synchronize the execution as well as
2895 * help cross references the two sets of TSC samples (the sequence numbers).
2896 *
2897 * The @a fLag parameter is used to modify the execution a tiny bit on one or
2898 * both of the CPUs. When @a fLag differs between the CPUs, it is thought that
2899 * it will help with making the CPUs enter lock step execution occationally.
2900 *
2901 */
2902static void supdrvTscDeltaMethod2CollectData(PSUPDRVTSCDELTAMETHOD2 pMyData, uint32_t volatile *piOtherSeqNo, bool fLag)
2903{
2904 SUPDRVTSCDELTAMETHOD2ENTRY *pEntry = &pMyData->aResults[0];
2905 uint32_t cLeft = RT_ELEMENTS(pMyData->aResults);
2906
2907 ASMAtomicWriteU32(&pMyData->iCurSeqNo, 0);
2908 ASMSerializeInstruction();
2909 while (cLeft-- > 0)
2910 {
2911 uint64_t uTsc;
2912 uint32_t iSeqMine = ASMAtomicIncU32(&pMyData->iCurSeqNo);
2913 uint32_t iSeqOther = ASMAtomicReadU32(piOtherSeqNo);
2914 ASMCompilerBarrier();
2915 ASMSerializeInstruction(); /* Way better result than with ASMMemoryFenceSSE2() in this position! */
2916 uTsc = ASMReadTSC();
2917 ASMAtomicIncU32(&pMyData->iCurSeqNo);
2918 ASMCompilerBarrier();
2919 ASMSerializeInstruction();
2920 pEntry->iSeqMine = iSeqMine;
2921 pEntry->iSeqOther = iSeqOther;
2922 pEntry->uTsc = uTsc;
2923 pEntry++;
2924 ASMSerializeInstruction();
2925 if (fLag)
2926 ASMNopPause();
2927 }
2928}
2929
2930
2931/**
2932 * TSC delta measurment algorithm \#2 (GIP_TSC_DELTA_METHOD_2).
2933 *
2934 * See supdrvTscDeltaMethod2CollectData for algorithm details.
2935 *
2936 * @param pArgs The argument/state data.
2937 * @param pSync The synchronization structure
2938 * (pDevExt->pTscDeltaSync).
2939 * @param fIsMaster Set if master, clear if worker.
2940 * @param iTry The attempt number.
2941 */
2942static void supdrvTscDeltaMethod2Loop(PSUPDRVGIPTSCDELTARGS pArgs, PSUPTSCDELTASYNC pSync, bool fIsMaster, uint32_t iTry)
2943{
2944 unsigned iLoop;
2945
2946 if (fIsMaster)
2947 ASMAtomicWriteBool(&pArgs->M2.fQuitEarly, false);
2948
2949 for (iLoop = 0; iLoop < GIP_TSC_DELTA_M2_LOOPS; iLoop++)
2950 {
2951 if (fIsMaster)
2952 {
2953 RTCCUINTREG uFlags;
2954
2955 /*
2956 * Adjust the loop lag fudge.
2957 */
2958# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
2959 if (iLoop < GIP_TSC_DELTA_M2_PRIMER_LOOPS)
2960 {
2961 /* Lag during the priming to be nice to everyone.. */
2962 pArgs->M2.fLagMaster = true;
2963 pArgs->M2.fLagWorker = true;
2964 }
2965 else
2966# endif
2967 if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4)
2968 {
2969 /* 25 % of the body without lagging. */
2970 pArgs->M2.fLagMaster = false;
2971 pArgs->M2.fLagWorker = false;
2972 }
2973 else if (iLoop < (GIP_TSC_DELTA_M2_LOOPS - GIP_TSC_DELTA_M2_PRIMER_LOOPS) / 4 * 2)
2974 {
2975 /* 25 % of the body with both lagging. */
2976 pArgs->M2.fLagMaster = true;
2977 pArgs->M2.fLagWorker = true;
2978 }
2979 else
2980 {
2981 /* 50% of the body with alternating lag. */
2982 pArgs->M2.fLagMaster = (iLoop & 1) == 0;
2983 pArgs->M2.fLagWorker = (iLoop & 1) == 1;
2984 }
2985
2986 /*
2987 * Sync up with the worker and collect data.
2988 */
2989 TSCDELTA_MASTER_SYNC_BEFORE(pSync);
2990 supdrvTscDeltaMethod2CollectData(pArgs->M2.pMasterData, &pArgs->M2.pWorkerData->iCurSeqNo, pArgs->M2.fLagMaster);
2991 TSCDELTA_MASTER_SYNC_AFTER(pSync);
2992
2993 /*
2994 * Process the data.
2995 */
2996# if GIP_TSC_DELTA_M2_PRIMER_LOOPS > 0
2997 if (iLoop >= GIP_TSC_DELTA_M2_PRIMER_LOOPS)
2998# endif
2999 supdrvTscDeltaMethod2ProcessDataOnMaster(pArgs, iLoop);
3000
3001 TSCDELTA_MASTER_KICK_OTHER_OUT_OF_AFTER(pSync);
3002 }
3003 else
3004 {
3005 /*
3006 * The worker.
3007 */
3008 TSCDELTA_OTHER_SYNC_BEFORE(pSync, (void)0);
3009 supdrvTscDeltaMethod2CollectData(pArgs->M2.pWorkerData, &pArgs->M2.pMasterData->iCurSeqNo, pArgs->M2.fLagWorker);
3010 TSCDELTA_OTHER_SYNC_AFTER(pSync);
3011 }
3012
3013 if (ASMAtomicReadBool(&pArgs->M2.fQuitEarly))
3014 break;
3015
3016 }
3017}
3018
3019
3020/**
3021 * Initializes the argument/state data belonging to algorithm \#2.
3022 *
3023 * @returns VBox status code.
3024 * @param pArgs The argument/state data.
3025 */
3026static int supdrvTscDeltaMethod2Init(PSUPDRVGIPTSCDELTARGS pArgs)
3027{
3028 pArgs->M2.pMasterData = NULL;
3029 pArgs->M2.pWorkerData = NULL;
3030
3031 uint32_t const fFlags = /*RTMEMALLOCEX_FLAGS_ANY_CTX |*/ RTMEMALLOCEX_FLAGS_ZEROED;
3032 int rc = RTMemAllocEx(sizeof(*pArgs->M2.pWorkerData), 0, fFlags, (void **)&pArgs->M2.pWorkerData);
3033 if (RT_SUCCESS(rc))
3034 rc = RTMemAllocEx(sizeof(*pArgs->M2.pMasterData), 0, fFlags, (void **)&pArgs->M2.pMasterData);
3035 return rc;
3036}
3037
3038
3039/**
3040 * Undoes what supdrvTscDeltaMethod2Init() did.
3041 *
3042 * @param pArgs The argument/state data.
3043 */
3044static void supdrvTscDeltaMethod2Delete(PSUPDRVGIPTSCDELTARGS pArgs)
3045{
3046 RTMemFreeEx(pArgs->M2.pMasterData, sizeof(*pArgs->M2.pMasterData));
3047 RTMemFreeEx(pArgs->M2.pWorkerData, sizeof(*pArgs->M2.pWorkerData));
3048# if 0
3049 SUPR0Printf("cHits=%d m=%d w=%d\n", pArgs->M2.cHits, pArgs->pMaster->idApic, pArgs->pWorker->idApic);
3050# endif
3051}
3052
3053
3054#endif /* GIP_TSC_DELTA_METHOD_2 */
3055
3056/** Prestart wait. */
3057#define GIP_TSC_DELTA_SYNC2_PRESTART_WAIT UINT32_C(0xffe)
3058
3059/** Start measurement of TSC delta. */
3060#define GIP_TSC_DELTA_SYNC2_START UINT32_C(1)
3061/** Worker thread is ready for reading the TSC. */
3062#define GIP_TSC_DELTA_SYNC2_WORKER_READY UINT32_C(2)
3063/** Worker thread is done updating TSC delta info. */
3064#define GIP_TSC_DELTA_SYNC2_WORKER_DONE UINT32_C(3)
/** When IPRT isn't concurrent safe: Master is ready and will wait for worker
 * with a timeout. */
3067#define GIP_TSC_DELTA_SYNC2_PRESTART_MASTER UINT32_C(4)
3068
3069
3070/**
3071 * Callback used by supdrvMeasureInitialTscDeltas() to read the TSC on two CPUs
3072 * and compute the delta between them.
3073 *
3074 * @param idCpu The CPU we are current scheduled on.
3075 * @param pvUser1 Pointer to a parameter package (SUPDRVGIPTSCDELTARGS).
3076 * @param pvUser2 Unused.
3077 *
3078 * @remarks Measuring TSC deltas between the CPUs is tricky because we need to
3079 * read the TSC at exactly the same time on both the master and the
3080 * worker CPUs. Due to DMA, bus arbitration, cache locality,
3081 * contention, SMI, pipelining etc. there is no guaranteed way of
3082 * doing this on x86 CPUs.
3083 */
3084static DECLCALLBACK(void) supdrvMeasureTscDeltaCallback(RTCPUID idCpu, void *pvUser1, void *pvUser2)
3085{
3086 PSUPDRVGIPTSCDELTARGS pArgs = (PSUPDRVGIPTSCDELTARGS)pvUser1;
3087 PSUPDRVDEVEXT pDevExt = pArgs->pDevExt;
3088 PSUPTSCDELTASYNC pSync = pDevExt->pTscDeltaSync;
3089 PSUPGIPCPU pGipCpuWorker = pArgs->pWorker;
3090 PSUPGIPCPU pGipCpuMaster = pArgs->pMaster;
3091 bool const fIsMaster = idCpu == pGipCpuMaster->idCpu;
3092 uint32_t iTry;
3093#if 0
3094 PSUPTSCDELTASYNC2 pOtherSync;
3095 SUPTSCDELTASYNC2 MySync;
3096#endif
3097
3098 /* A bit of paranoia first. */
3099 if (!pGipCpuMaster || !pGipCpuWorker)
3100 return;
3101
3102 /*
3103 * If the CPU isn't part of the measurement, return immediately.
3104 */
3105 if ( !fIsMaster
3106 && idCpu != pGipCpuWorker->idCpu)
3107 return;
3108
3109#if 0
3110 /*
3111 * Set up my synchronization stuff and wait for the other party to show up.
3112 * We don't wait forever since the other party may have gone fishing after
3113 * we checked it out in supdrvMeasureTscDeltaOne, and then there is of course
3114 * windows and it's BSOD if we waste too much time here.
3115 */
3116 if (fIsMaster)
3117 {
3118 MySync.uVar1 = GIP_TSC_DELTA_SYNC2_PRESTART_WAIT;
3119 ASMSerializeInstruction(); ASMCompilerBarrier();
3120 ASMAtomicWritePtr(&pArgs->pSyncMaster, &MySync);
3121 }
3122 else
3123 {
3124 MySync.uVar1 = GIP_TSC_DELTA_SYNC2_PRESTART_WAIT;
3125 ASMSerializeInstruction(); ASMCompilerBarrier();
3126 ASMAtomicWritePtr(&pArgs->pSyncWorker, &MySync);
3127 }
3128
3129 MySync.uTscStart = ASMReadTSC();
3130 MySync.cMaxTicks = u64CpuHz
3131
3132 while ((pOtherSync = ASMAtomicReadPtr((void * volatile *)(fIsMaster ? &pArgs->pSyncWorker : &pArgs->pSyncMaster))) != NULL)
3133 {
3134 uint32_t cInner = 10240;
3135 while ( cInner-- > 0
3136 && ASMAtomicUoReadU32(MySync.uVar1) == GIP_TSC_DELTA_SYNC2_PRESTART_WAIT)
3137 ASMNopPause();
3138
3139 }
3140#endif
3141
3142
3143 /* If the IPRT API isn't concurrent safe, the master and worker wait for each other
3144 with a timeout to avoid deadlocking the entire system. */
3145 if (!RTMpOnAllIsConcurrentSafe())
3146 {
3147 /** @todo This was introduced for Windows, but since Windows doesn't use this
3148 * code path any longer (as DPC timeouts BSOD regardless of interrupts,
3149 * see @bugref{6710} comment 81), eventually phase it out. */
3150 uint64_t uTscNow;
3151 uint64_t uTscStart;
3152 uint64_t const cWaitTicks = 130000; /* Arbitrary value, can be tweaked later. */
3153
3154 ASMSerializeInstruction();
3155 uTscStart = ASMReadTSC();
3156 if (fIsMaster)
3157 {
3158 ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_PRESTART_MASTER);
3159 while (ASMAtomicReadU32(&pDevExt->pTscDeltaSync->u) != GIP_TSC_DELTA_SYNC_PRESTART_WORKER)
3160 {
3161 ASMSerializeInstruction();
3162 uTscNow = ASMReadTSC();
3163 if (uTscNow - uTscStart > cWaitTicks)
3164 {
3165 /* Set the worker delta to indicate failure, not the master. */
3166 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
3167 return;
3168 }
3169
3170 ASMNopPause();
3171 }
3172 }
3173 else
3174 {
3175 while (ASMAtomicReadU32(&pDevExt->pTscDeltaSync->u) != GIP_TSC_DELTA_SYNC_PRESTART_MASTER)
3176 {
3177 ASMSerializeInstruction();
3178 uTscNow = ASMReadTSC();
3179 if (uTscNow - uTscStart > cWaitTicks)
3180 {
3181 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
3182 return;
3183 }
3184
3185 ASMNopPause();
3186 }
3187 ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_PRESTART_WORKER);
3188 }
3189 }
3190
3191 /*
3192 * Retry loop.
3193 */
3194 Assert(pGipCpuWorker->i64TSCDelta == INT64_MAX);
3195 for (iTry = 0; iTry < 12; iTry++)
3196 {
3197 /*
3198 * Do the measurements.
3199 */
3200#ifdef GIP_TSC_DELTA_METHOD_1
3201 supdrvTscDeltaMethod1Loop(pArgs, pSync, fIsMaster, iTry);
3202#elif defined(GIP_TSC_DELTA_METHOD_2)
3203 supdrvTscDeltaMethod2Loop(pArgs, pSync, fIsMaster, iTry);
3204#else
3205# error "huh??"
3206#endif
3207
3208 /*
3209 * Success? If so, stop trying.
3210 */
3211 if (pGipCpuWorker->i64TSCDelta != INT64_MAX)
3212 {
3213 if (fIsMaster)
3214 {
3215 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuMaster->iCpuSet);
3216 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuMaster->iCpuSet);
3217 }
3218 else
3219 {
3220 RTCpuSetDelByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet);
3221 RTCpuSetAddByIndex(&pDevExt->TscDeltaObtainedCpuSet, pGipCpuWorker->iCpuSet);
3222 }
3223 break;
3224 }
3225 }
3226}
3227
3228
3229/**
3230 * Clears TSC delta related variables.
3231 *
3232 * Clears all TSC samples as well as the delta synchronization variable on the
3233 * all the per-CPU structs. Optionally also clears the per-cpu deltas too.
3234 *
3235 * @param pDevExt Pointer to the device instance data.
3236 * @param fClearDeltas Whether the deltas are also to be cleared.
3237 */
3238DECLINLINE(void) supdrvClearTscSamples(PSUPDRVDEVEXT pDevExt, bool fClearDeltas)
3239{
3240 unsigned iCpu;
3241 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3242 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
3243 {
3244 PSUPGIPCPU pGipCpu = &pGip->aCPUs[iCpu];
3245 ASMAtomicWriteU64(&pGipCpu->u64TSCSample, GIP_TSC_DELTA_RSVD);
3246 if (fClearDeltas)
3247 ASMAtomicWriteS64(&pGipCpu->i64TSCDelta, INT64_MAX);
3248 }
3249 ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_STOP);
3250}
3251
3252
3253/**
3254 * Measures the TSC delta between the master GIP CPU and one specified worker
3255 * CPU.
3256 *
3257 * @returns VBox status code.
3258 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED on pure measurement
3259 * failure.
3260 * @param pDevExt Pointer to the device instance data.
3261 * @param idxWorker The index of the worker CPU from the GIP's array of
3262 * CPUs.
3263 *
3264 * @remarks This must be called with preemption enabled!
3265 */
3266static int supdrvMeasureTscDeltaOne(PSUPDRVDEVEXT pDevExt, uint32_t idxWorker)
3267{
3268 int rc;
3269 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3270 RTCPUID idMaster = pDevExt->idGipMaster;
3271 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[idxWorker];
3272 PSUPGIPCPU pGipCpuMaster;
3273 uint32_t iGipCpuMaster;
3274
3275 /* Validate input a bit. */
3276 AssertReturn(pGip, VERR_INVALID_PARAMETER);
3277 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3278 Assert(RTThreadPreemptIsEnabled(NIL_RTTHREAD));
3279
3280 /*
3281 * Don't attempt measuring the delta for the GIP master.
3282 */
3283 if (pGipCpuWorker->idCpu == idMaster)
3284 {
3285 if (pGipCpuWorker->i64TSCDelta == INT64_MAX) /* This shouldn't happen, but just in case. */
3286 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, GIP_TSC_DELTA_INITIAL_MASTER_VALUE);
3287 return VINF_SUCCESS;
3288 }
3289
3290 /*
3291 * If the CPU has hyper-threading and the APIC IDs of the master and worker are adjacent,
3292 * try pick a different master. (This fudge only works with multi core systems.)
3293 * ASSUMES related threads have adjacent APIC IDs. ASSUMES two threads per core.
3294 */
3295 iGipCpuMaster = supdrvGipFindCpuIndexForCpuId(pGip, idMaster);
3296 AssertReturn(iGipCpuMaster < pGip->cCpus, VERR_INVALID_CPU_ID);
3297 pGipCpuMaster = &pGip->aCPUs[iGipCpuMaster];
3298 if ( (pGipCpuMaster->idApic & ~1) == (pGipCpuWorker->idApic & ~1)
3299 && ASMHasCpuId()
3300 && ASMIsValidStdRange(ASMCpuId_EAX(0))
3301 && (ASMCpuId_EDX(1) & X86_CPUID_FEATURE_EDX_HTT)
3302 && pGip->cOnlineCpus > 2)
3303 {
3304 uint32_t i;
3305 for (i = 0; i < pGip->cCpus; i++)
3306 if ( i != iGipCpuMaster
3307 && i != idxWorker
3308 && pGip->aCPUs[i].enmState == SUPGIPCPUSTATE_ONLINE
3309 && pGip->aCPUs[i].i64TSCDelta != INT64_MAX
3310 && pGip->aCPUs[i].idCpu != NIL_RTCPUID
3311 && pGip->aCPUs[i].idCpu != idMaster /* paranoia starts here... */
3312 && pGip->aCPUs[i].idCpu != pGipCpuWorker->idCpu
3313 && pGip->aCPUs[i].idApic != pGipCpuWorker->idApic
3314 && pGip->aCPUs[i].idApic != pGipCpuMaster->idApic)
3315 {
3316 iGipCpuMaster = i;
3317 pGipCpuMaster = &pGip->aCPUs[i];
3318 idMaster = pGipCpuMaster->idCpu;
3319 break;
3320 }
3321 }
3322
3323 /*
3324 * Set the master TSC as the initiator. This serializes delta measurments.
3325 */
3326 while (!ASMAtomicCmpXchgU32(&pDevExt->idTscDeltaInitiator, idMaster, NIL_RTCPUID))
3327 {
3328 /*
3329 * Sleep here rather than spin as there is a parallel measurement
3330 * being executed and that can take a good while to be done.
3331 */
3332 RTThreadSleep(1);
3333 }
3334
3335 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpuWorker->iCpuSet))
3336 {
3337 /*
3338 * Initialize data package for the RTMpOnAll callback.
3339 */
3340 SUPDRVGIPTSCDELTARGS Args;
3341 RT_ZERO(Args);
3342 Args.pWorker = pGipCpuWorker;
3343 Args.pMaster = pGipCpuMaster;
3344 Args.pDevExt = pDevExt;
3345 Args.pSyncMaster = NULL;
3346 Args.pSyncWorker = NULL;
3347#ifdef GIP_TSC_DELTA_METHOD_1
3348 rc = supdrvTscDeltaMethod1Init(&Args);
3349#elif defined(GIP_TSC_DELTA_METHOD_2)
3350 rc = supdrvTscDeltaMethod2Init(&Args);
3351#else
3352# error "huh?"
3353#endif
3354 if (RT_SUCCESS(rc))
3355 {
3356 /*
3357 * Fire TSC-read workers on all CPUs but only synchronize between master
3358 * and one worker to ease memory contention.
3359 */
3360 ASMAtomicWriteS64(&pGipCpuWorker->i64TSCDelta, INT64_MAX);
3361 ASMAtomicWriteU32(&pDevExt->pTscDeltaSync->u, GIP_TSC_DELTA_SYNC_STOP);
3362
3363 rc = RTMpOnAll(supdrvMeasureTscDeltaCallback, &Args, NULL);
3364 if (RT_SUCCESS(rc))
3365 {
3366 if (RT_LIKELY(pGipCpuWorker->i64TSCDelta != INT64_MAX))
3367 {
3368 /*
3369 * Work the TSC delta applicability rating. It starts
3370 * optimistic in supdrvGipInit, we downgrade it here.
3371 */
3372 SUPGIPUSETSCDELTA enmRating;
3373 if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO
3374 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_ROUGHLY_ZERO)
3375 enmRating = SUPGIPUSETSCDELTA_NOT_ZERO;
3376 else if ( pGipCpuWorker->i64TSCDelta > GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO
3377 || pGipCpuWorker->i64TSCDelta < -GIP_TSC_DELTA_THRESHOLD_PRACTICALLY_ZERO)
3378 enmRating = SUPGIPUSETSCDELTA_ROUGHLY_ZERO;
3379 else
3380 enmRating = SUPGIPUSETSCDELTA_PRACTICALLY_ZERO;
3381 if (pGip->enmUseTscDelta < enmRating)
3382 {
3383 AssertCompile(sizeof(pGip->enmUseTscDelta) == sizeof(uint32_t));
3384 ASMAtomicWriteU32((uint32_t volatile *)&pGip->enmUseTscDelta, enmRating);
3385 }
3386 }
3387 else
3388 rc = VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
3389 }
3390 }
3391
3392#ifdef GIP_TSC_DELTA_METHOD_1
3393 supdrvTscDeltaMethod1Delete(&Args);
3394#elif defined(GIP_TSC_DELTA_METHOD_2)
3395 supdrvTscDeltaMethod2Delete(&Args);
3396#else
3397# error "huh?"
3398#endif
3399 }
3400 else
3401 rc = VERR_CPU_OFFLINE;
3402
3403 ASMAtomicWriteU32(&pDevExt->idTscDeltaInitiator, NIL_RTCPUID);
3404 return rc;
3405}
3406
3407
3408/**
3409 * Performs the initial measurements of the TSC deltas between CPUs.
3410 *
3411 * This is called by supdrvGipCreate or triggered by it if threaded.
3412 *
3413 * @returns VBox status code.
3414 * @param pDevExt Pointer to the device instance data.
3415 *
3416 * @remarks Must be called only after supdrvGipInitOnCpu() as this function uses
3417 * idCpu, GIP's online CPU set which are populated in
3418 * supdrvGipInitOnCpu().
3419 */
3420static int supdrvMeasureInitialTscDeltas(PSUPDRVDEVEXT pDevExt)
3421{
3422 PSUPGIPCPU pGipCpuMaster;
3423 unsigned iCpu;
3424 unsigned iOddEven;
3425 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3426 uint32_t idxMaster = UINT32_MAX;
3427 int rc = VINF_SUCCESS;
3428 uint32_t cMpOnOffEvents = ASMAtomicReadU32(&pDevExt->cMpOnOffEvents);
3429
3430 Assert(pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3431
3432 /*
3433 * Pick the first CPU online as the master TSC and make it the new GIP master based
3434 * on the APIC ID.
3435 *
3436 * Technically we can simply use "idGipMaster" but doing this gives us master as CPU 0
3437 * in most cases making it nicer/easier for comparisons. It is safe to update the GIP
3438 * master as this point since the sync/async timer isn't created yet.
3439 */
3440 supdrvClearTscSamples(pDevExt, true /* fClearDeltas */);
3441 for (iCpu = 0; iCpu < RT_ELEMENTS(pGip->aiCpuFromApicId); iCpu++)
3442 {
3443 uint16_t idxCpu = pGip->aiCpuFromApicId[iCpu];
3444 if (idxCpu != UINT16_MAX)
3445 {
3446 PSUPGIPCPU pGipCpu = &pGip->aCPUs[idxCpu];
3447 if (RTCpuSetIsMemberByIndex(&pGip->OnlineCpuSet, pGipCpu->iCpuSet))
3448 {
3449 idxMaster = idxCpu;
3450 pGipCpu->i64TSCDelta = GIP_TSC_DELTA_INITIAL_MASTER_VALUE;
3451 break;
3452 }
3453 }
3454 }
3455 AssertReturn(idxMaster != UINT32_MAX, VERR_CPU_NOT_FOUND);
3456 pGipCpuMaster = &pGip->aCPUs[idxMaster];
3457 ASMAtomicWriteSize(&pDevExt->idGipMaster, pGipCpuMaster->idCpu);
3458
3459 /*
3460 * If there is only a single CPU online we have nothing to do.
3461 */
3462 if (pGip->cOnlineCpus <= 1)
3463 {
3464 AssertReturn(pGip->cOnlineCpus > 0, VERR_INTERNAL_ERROR_5);
3465 return VINF_SUCCESS;
3466 }
3467
3468 /*
3469 * Loop thru the GIP CPU array and get deltas for each CPU (except the
3470 * master). We do the CPUs with the even numbered APIC IDs first so that
3471 * we've got alternative master CPUs to pick from on hyper-threaded systems.
3472 */
3473 for (iOddEven = 0; iOddEven < 2; iOddEven++)
3474 {
3475 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
3476 {
3477 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
3478 if ( iCpu != idxMaster
3479 && (iOddEven > 0 || (pGipCpuWorker->idApic & 1) == 0)
3480 && RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
3481 {
3482 rc = supdrvMeasureTscDeltaOne(pDevExt, iCpu);
3483 if (RT_FAILURE(rc))
3484 {
3485 SUPR0Printf("supdrvMeasureTscDeltaOne failed. rc=%d CPU[%u].idCpu=%u Master[%u].idCpu=%u\n", rc, iCpu,
3486 pGipCpuWorker->idCpu, idxMaster, pDevExt->idGipMaster, pGipCpuMaster->idCpu);
3487 break;
3488 }
3489
3490 if (ASMAtomicReadU32(&pDevExt->cMpOnOffEvents) != cMpOnOffEvents)
3491 {
3492 SUPR0Printf("One or more CPUs transitioned between online & offline states. I'm confused, retry...\n");
3493 rc = VERR_TRY_AGAIN;
3494 break;
3495 }
3496 }
3497 }
3498 }
3499
3500 return rc;
3501}
3502
3503
3504#ifdef SUPDRV_USE_TSC_DELTA_THREAD
3505
3506/**
3507 * Switches the TSC-delta measurement thread into the butchered state.
3508 *
3509 * @returns VBox status code.
3510 * @param pDevExt Pointer to the device instance data.
3511 * @param fSpinlockHeld Whether the TSC-delta spinlock is held or not.
3512 * @param pszFailed An error message to log.
3513 * @param rcFailed The error code to exit the thread with.
3514 */
3515static int supdrvTscDeltaThreadButchered(PSUPDRVDEVEXT pDevExt, bool fSpinlockHeld, const char *pszFailed, int rcFailed)
3516{
3517 if (!fSpinlockHeld)
3518 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3519
3520 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Butchered;
3521 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3522 OSDBGPRINT(("supdrvTscDeltaThreadButchered: %s. rc=%Rrc\n", rcFailed));
3523 return rcFailed;
3524}
3525
3526
3527/**
3528 * The TSC-delta measurement thread.
3529 *
3530 * @returns VBox status code.
3531 * @param hThread The thread handle.
3532 * @param pvUser Opaque pointer to the device instance data.
3533 */
3534static DECLCALLBACK(int) supdrvTscDeltaThread(RTTHREAD hThread, void *pvUser)
3535{
3536 PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
3537 bool fInitialMeasurement = true;
3538 uint32_t cConsecutiveTimeouts = 0;
3539 int rc = VERR_INTERNAL_ERROR_2;
3540 for (;;)
3541 {
3542 /*
3543 * Switch on the current state.
3544 */
3545 SUPDRVTSCDELTATHREADSTATE enmState;
3546 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3547 enmState = pDevExt->enmTscDeltaThreadState;
3548 switch (enmState)
3549 {
3550 case kTscDeltaThreadState_Creating:
3551 {
3552 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
3553 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent);
3554 if (RT_FAILURE(rc))
3555 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
3556 /* fall thru */
3557 }
3558
3559 case kTscDeltaThreadState_Listening:
3560 {
3561 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3562
3563 /* Simple adaptive timeout. */
3564 if (cConsecutiveTimeouts++ == 10)
3565 {
3566 if (pDevExt->cMsTscDeltaTimeout == 1) /* 10 ms */
3567 pDevExt->cMsTscDeltaTimeout = 10;
3568 else if (pDevExt->cMsTscDeltaTimeout == 10) /* +100 ms */
3569 pDevExt->cMsTscDeltaTimeout = 100;
3570 else if (pDevExt->cMsTscDeltaTimeout == 100) /* +1000 ms */
3571 pDevExt->cMsTscDeltaTimeout = 500;
3572 cConsecutiveTimeouts = 0;
3573 }
3574 rc = RTThreadUserWait(pDevExt->hTscDeltaThread, pDevExt->cMsTscDeltaTimeout);
3575 if ( RT_FAILURE(rc)
3576 && rc != VERR_TIMEOUT)
3577 return supdrvTscDeltaThreadButchered(pDevExt, false /* fSpinlockHeld */, "RTThreadUserWait", rc);
3578 RTThreadUserReset(pDevExt->hTscDeltaThread);
3579 break;
3580 }
3581
3582 case kTscDeltaThreadState_WaitAndMeasure:
3583 {
3584 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Measuring;
3585 rc = RTSemEventSignal(pDevExt->hTscDeltaEvent); /* (Safe on windows as long as spinlock isn't IRQ safe.) */
3586 if (RT_FAILURE(rc))
3587 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "RTSemEventSignal", rc);
3588 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3589 pDevExt->cMsTscDeltaTimeout = 1;
3590 RTThreadSleep(10);
3591 /* fall thru */
3592 }
3593
3594 case kTscDeltaThreadState_Measuring:
3595 {
3596 cConsecutiveTimeouts = 0;
3597 if (fInitialMeasurement)
3598 {
3599 int cTries = 8;
3600 int cMsWaitPerTry = 10;
3601 fInitialMeasurement = false;
3602 do
3603 {
3604 rc = supdrvMeasureInitialTscDeltas(pDevExt);
3605 if ( RT_SUCCESS(rc)
3606 || ( RT_FAILURE(rc)
3607 && rc != VERR_TRY_AGAIN
3608 && rc != VERR_CPU_OFFLINE))
3609 {
3610 break;
3611 }
3612 RTThreadSleep(cMsWaitPerTry);
3613 } while (cTries-- > 0);
3614 }
3615 else
3616 {
3617 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3618 unsigned iCpu;
3619
3620 /* Measure TSC-deltas only for the CPUs that are in the set. */
3621 rc = VINF_SUCCESS;
3622 for (iCpu = 0; iCpu < pGip->cCpus; iCpu++)
3623 {
3624 PSUPGIPCPU pGipCpuWorker = &pGip->aCPUs[iCpu];
3625 if ( pGipCpuWorker->i64TSCDelta == INT64_MAX
3626 && RTCpuSetIsMemberByIndex(&pDevExt->TscDeltaCpuSet, pGipCpuWorker->iCpuSet))
3627 {
3628 rc |= supdrvMeasureTscDeltaOne(pDevExt, iCpu);
3629 }
3630 }
3631 }
3632 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3633 if (pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
3634 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Listening;
3635 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3636 Assert(rc != VERR_NOT_AVAILABLE); /* VERR_NOT_AVAILABLE is used as the initial value. */
3637 ASMAtomicWriteS32(&pDevExt->rcTscDelta, rc);
3638 break;
3639 }
3640
3641 case kTscDeltaThreadState_Terminating:
3642 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Destroyed;
3643 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3644 return VINF_SUCCESS;
3645
3646 case kTscDeltaThreadState_Butchered:
3647 default:
3648 return supdrvTscDeltaThreadButchered(pDevExt, true /* fSpinlockHeld */, "Invalid state", VERR_INVALID_STATE);
3649 }
3650 }
3651
3652 return rc;
3653}
3654
3655
3656/**
3657 * Waits for the TSC-delta measurement thread to respond to a state change.
3658 *
3659 * @returns VINF_SUCCESS on success, VERR_TIMEOUT if it doesn't respond in time,
3660 * other error code on internal error.
3661 *
3662 * @param pThis Pointer to the grant service instance data.
3663 * @param enmCurState The current state.
3664 * @param enmNewState The new state we're waiting for it to enter.
3665 */
3666static int supdrvTscDeltaThreadWait(PSUPDRVDEVEXT pDevExt, SUPDRVTSCDELTATHREADSTATE enmCurState,
3667 SUPDRVTSCDELTATHREADSTATE enmNewState)
3668{
3669 /*
3670 * Wait a short while for the expected state transition.
3671 */
3672 int rc;
3673 RTSemEventWait(pDevExt->hTscDeltaEvent, RT_MS_1SEC);
3674 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3675 if (pDevExt->enmTscDeltaThreadState == enmNewState)
3676 {
3677 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3678 rc = VINF_SUCCESS;
3679 }
3680 else if (pDevExt->enmTscDeltaThreadState == enmCurState)
3681 {
3682 /*
3683 * Wait longer if the state has not yet transitioned to the one we want.
3684 */
3685 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3686 rc = RTSemEventWait(pDevExt->hTscDeltaEvent, 50 * RT_MS_1SEC);
3687 if ( RT_SUCCESS(rc)
3688 || rc == VERR_TIMEOUT)
3689 {
3690 /*
3691 * Check the state whether we've succeeded.
3692 */
3693 SUPDRVTSCDELTATHREADSTATE enmState;
3694 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3695 enmState = pDevExt->enmTscDeltaThreadState;
3696 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3697 if (enmState == enmNewState)
3698 rc = VINF_SUCCESS;
3699 else if (enmState == enmCurState)
3700 {
3701 rc = VERR_TIMEOUT;
3702 OSDBGPRINT(("supdrvTscDeltaThreadWait: timed out state transition. enmState=%d enmNewState=%d\n", enmState,
3703 enmNewState));
3704 }
3705 else
3706 {
3707 rc = VERR_INTERNAL_ERROR;
3708 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d, expected %d\n", enmCurState,
3709 enmState, enmNewState));
3710 }
3711 }
3712 else
3713 OSDBGPRINT(("supdrvTscDeltaThreadWait: RTSemEventWait failed. rc=%Rrc\n", rc));
3714 }
3715 else
3716 {
3717 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3718 OSDBGPRINT(("supdrvTscDeltaThreadWait: invalid state transition from %d to %d\n", enmCurState, enmNewState));
3719 rc = VERR_INTERNAL_ERROR;
3720 }
3721
3722 return rc;
3723}
3724
3725
3726/**
3727 * Waits for TSC-delta measurements to be completed for all online CPUs.
3728 *
3729 * @returns VBox status code.
3730 * @param pDevExt Pointer to the device instance data.
3731 */
3732static int supdrvTscDeltaThreadWaitForOnlineCpus(PSUPDRVDEVEXT pDevExt)
3733{
3734 int cTriesLeft = 5;
3735 int cMsTotalWait;
3736 int cMsWaited = 0;
3737 int cMsWaitGranularity = 1;
3738
3739 PSUPGLOBALINFOPAGE pGip = pDevExt->pGip;
3740 AssertReturn(pGip, VERR_INVALID_POINTER);
3741
3742 if (RT_UNLIKELY(pDevExt->hTscDeltaThread == NIL_RTTHREAD))
3743 return VERR_THREAD_NOT_WAITABLE;
3744
3745 cMsTotalWait = RT_MIN(pGip->cPresentCpus + 10, 200);
3746 while (cTriesLeft-- > 0)
3747 {
3748 if (RTCpuSetIsEqual(&pDevExt->TscDeltaObtainedCpuSet, &pGip->OnlineCpuSet))
3749 return VINF_SUCCESS;
3750 RTThreadSleep(cMsWaitGranularity);
3751 cMsWaited += cMsWaitGranularity;
3752 if (cMsWaited >= cMsTotalWait)
3753 break;
3754 }
3755
3756 return VERR_TIMEOUT;
3757}
3758
3759
3760/**
3761 * Terminates the actual thread running supdrvTscDeltaThread().
3762 *
3763 * This is an internal worker function for supdrvTscDeltaThreadInit() and
3764 * supdrvTscDeltaTerm().
3765 *
3766 * @param pDevExt Pointer to the device instance data.
3767 */
3768static void supdrvTscDeltaThreadTerminate(PSUPDRVDEVEXT pDevExt)
3769{
3770 int rc;
3771 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3772 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Terminating;
3773 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3774 RTThreadUserSignal(pDevExt->hTscDeltaThread);
3775 rc = RTThreadWait(pDevExt->hTscDeltaThread, 50 * RT_MS_1SEC, NULL /* prc */);
3776 if (RT_FAILURE(rc))
3777 {
3778 /* Signal a few more times before giving up. */
3779 int cTriesLeft = 5;
3780 while (--cTriesLeft > 0)
3781 {
3782 RTThreadUserSignal(pDevExt->hTscDeltaThread);
3783 rc = RTThreadWait(pDevExt->hTscDeltaThread, 2 * RT_MS_1SEC, NULL /* prc */);
3784 if (rc != VERR_TIMEOUT)
3785 break;
3786 }
3787 }
3788}
3789
3790
3791/**
3792 * Initializes and spawns the TSC-delta measurement thread.
3793 *
3794 * A thread is required for servicing re-measurement requests from events like
3795 * CPUs coming online, suspend/resume etc. as it cannot be done synchronously
3796 * under all contexts on all OSs.
3797 *
3798 * @returns VBox status code.
3799 * @param pDevExt Pointer to the device instance data.
3800 *
3801 * @remarks Must only be called -after- initializing GIP and setting up MP
3802 * notifications!
3803 */
3804static int supdrvTscDeltaThreadInit(PSUPDRVDEVEXT pDevExt)
3805{
3806 int rc;
3807 Assert(pDevExt->pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED);
3808 rc = RTSpinlockCreate(&pDevExt->hTscDeltaSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_UNSAFE, "VBoxTscSpnLck");
3809 if (RT_SUCCESS(rc))
3810 {
3811 rc = RTSemEventCreate(&pDevExt->hTscDeltaEvent);
3812 if (RT_SUCCESS(rc))
3813 {
3814 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_Creating;
3815 pDevExt->cMsTscDeltaTimeout = 1;
3816 rc = RTThreadCreate(&pDevExt->hTscDeltaThread, supdrvTscDeltaThread, pDevExt, 0 /* cbStack */,
3817 RTTHREADTYPE_DEFAULT, RTTHREADFLAGS_WAITABLE, "VBoxTscThread");
3818 if (RT_SUCCESS(rc))
3819 {
3820 rc = supdrvTscDeltaThreadWait(pDevExt, kTscDeltaThreadState_Creating, kTscDeltaThreadState_Listening);
3821 if (RT_SUCCESS(rc))
3822 {
3823 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
3824 return rc;
3825 }
3826
3827 OSDBGPRINT(("supdrvTscDeltaInit: supdrvTscDeltaThreadWait failed. rc=%Rrc\n", rc));
3828 supdrvTscDeltaThreadTerminate(pDevExt);
3829 }
3830 else
3831 OSDBGPRINT(("supdrvTscDeltaInit: RTThreadCreate failed. rc=%Rrc\n", rc));
3832 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
3833 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
3834 }
3835 else
3836 OSDBGPRINT(("supdrvTscDeltaInit: RTSemEventCreate failed. rc=%Rrc\n", rc));
3837 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
3838 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
3839 }
3840 else
3841 OSDBGPRINT(("supdrvTscDeltaInit: RTSpinlockCreate failed. rc=%Rrc\n", rc));
3842
3843 return rc;
3844}
3845
3846
3847/**
3848 * Terminates the TSC-delta measurement thread and cleanup.
3849 *
3850 * @param pDevExt Pointer to the device instance data.
3851 */
3852static void supdrvTscDeltaTerm(PSUPDRVDEVEXT pDevExt)
3853{
3854 if ( pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK
3855 && pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
3856 {
3857 supdrvTscDeltaThreadTerminate(pDevExt);
3858 }
3859
3860 if (pDevExt->hTscDeltaSpinlock != NIL_RTSPINLOCK)
3861 {
3862 RTSpinlockDestroy(pDevExt->hTscDeltaSpinlock);
3863 pDevExt->hTscDeltaSpinlock = NIL_RTSPINLOCK;
3864 }
3865
3866 if (pDevExt->hTscDeltaEvent != NIL_RTSEMEVENT)
3867 {
3868 RTSemEventDestroy(pDevExt->hTscDeltaEvent);
3869 pDevExt->hTscDeltaEvent = NIL_RTSEMEVENT;
3870 }
3871
3872 ASMAtomicWriteS32(&pDevExt->rcTscDelta, VERR_NOT_AVAILABLE);
3873}
3874
3875#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
3876
3877/**
3878 * Measure the TSC delta for the CPU given by its CPU set index.
3879 *
3880 * @returns VBox status code.
3881 * @retval VERR_INTERRUPTED if interrupted while waiting.
3882 * @retval VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED if we were unable to get a
3883 * measurment.
3884 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
3885 * @retval VERR_CPU_OFFLINE if the specified CPU is offline.
3886 *
3887 * @param pSession The caller's session. GIP must've been mapped.
3888 * @param iCpuSet The CPU set index of the CPU to measure.
3889 * @param fFlags Flags, SUP_TSCDELTA_MEASURE_F_XXX.
3890 * @param cMsWaitRetry Number of milliseconds to wait between each retry.
3891 * @param cMsWaitThread Number of milliseconds to wait for the thread to get
3892 * ready.
3893 * @param cTries Number of times to try, pass 0 for the default.
3894 */
3895SUPR0DECL(int) SUPR0TscDeltaMeasureBySetIndex(PSUPDRVSESSION pSession, uint32_t iCpuSet, uint32_t fFlags,
3896 RTMSINTERVAL cMsWaitRetry, RTMSINTERVAL cMsWaitThread, uint32_t cTries)
3897{
3898 PSUPDRVDEVEXT pDevExt;
3899 PSUPGLOBALINFOPAGE pGip;
3900 uint16_t iGipCpu;
3901 int rc;
3902#ifdef SUPDRV_USE_TSC_DELTA_THREAD
3903 uint64_t msTsStartWait;
3904 uint32_t iWaitLoop;
3905#endif
3906
3907 /*
3908 * Validate and adjust the input.
3909 */
3910 AssertReturn(SUP_IS_SESSION_VALID(pSession), VERR_INVALID_PARAMETER);
3911 if (!pSession->fGipReferenced)
3912 return VERR_WRONG_ORDER;
3913
3914 pDevExt = pSession->pDevExt;
3915 AssertReturn(SUP_IS_DEVEXT_VALID(pDevExt), VERR_INVALID_PARAMETER);
3916
3917 pGip = pDevExt->pGip;
3918 AssertPtrReturn(pGip, VERR_INTERNAL_ERROR_2);
3919
3920 AssertReturn(iCpuSet < RTCPUSET_MAX_CPUS, VERR_INVALID_CPU_INDEX);
3921 AssertReturn(iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx), VERR_INVALID_CPU_INDEX);
3922 iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet];
3923 AssertReturn(iGipCpu < pGip->cCpus, VERR_INVALID_CPU_INDEX);
3924
3925 if (fFlags & ~SUP_TSCDELTA_MEASURE_F_VALID_MASK)
3926 return VERR_INVALID_FLAGS;
3927
3928 if (cTries == 0)
3929 cTries = 12;
3930 else if (cTries > 256)
3931 cTries = 256;
3932
3933 if (cMsWaitRetry > 1000)
3934 cMsWaitRetry = 1000;
3935
3936 /*
3937 * The request is a noop if the TSC delta isn't being used.
3938 */
3939 if (pGip->enmUseTscDelta <= SUPGIPUSETSCDELTA_ZERO_CLAIMED)
3940 return VINF_SUCCESS;
3941
3942#ifdef SUPDRV_USE_TSC_DELTA_THREAD
3943 /*
3944 * Has the TSC already been measured and we're not forced to redo it?
3945 */
3946 if ( pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX
3947 && !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE))
3948 return VINF_SUCCESS;
3949
3950 /*
3951 * Asynchronous request? Forward it to the thread, no waiting.
3952 */
3953 if (fFlags & SUP_TSCDELTA_MEASURE_F_ASYNC)
3954 {
3955 /** @todo Async. doesn't implement options like retries, waiting. We'll need
3956 * to pass those options to the thread somehow and implement it in the
3957 * thread. Check if anyone uses/needs fAsync before implementing this. */
3958 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3959 RTCpuSetAddByIndex(&pDevExt->TscDeltaCpuSet, iCpuSet);
3960 if ( pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Listening
3961 || pDevExt->enmTscDeltaThreadState == kTscDeltaThreadState_Measuring)
3962 {
3963 pDevExt->enmTscDeltaThreadState = kTscDeltaThreadState_WaitAndMeasure;
3964 rc = VINF_SUCCESS;
3965 }
3966 else
3967 rc = VERR_THREAD_IS_DEAD;
3968 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3969 RTThreadUserSignal(pDevExt->hTscDeltaThread);
3970 return VINF_SUCCESS;
3971 }
3972
3973 /*
3974 * If a TSC-delta measurement request is already being serviced by the thread,
3975 * wait 'cTries' times if a retry-timeout is provided, otherwise bail as busy.
3976 */
3977 msTsStartWait = RTTimeSystemMilliTS();
3978 for (iWaitLoop = 0;; iWaitLoop++)
3979 {
3980 uint64_t cMsElapsed;
3981 SUPDRVTSCDELTATHREADSTATE enmState;
3982 RTSpinlockAcquire(pDevExt->hTscDeltaSpinlock);
3983 enmState = pDevExt->enmTscDeltaThreadState;
3984 RTSpinlockRelease(pDevExt->hTscDeltaSpinlock);
3985
3986 if (enmState == kTscDeltaThreadState_Measuring)
3987 { /* Must wait, the thread is busy. */ }
3988 else if (enmState == kTscDeltaThreadState_WaitAndMeasure)
3989 { /* Must wait, this state only says what will happen next. */ }
3990 else if (enmState == kTscDeltaThreadState_Terminating)
3991 { /* Must wait, this state only says what should happen next. */ }
3992 else
3993 break; /* All other states, the thread is either idly listening or dead. */
3994
3995 /* Wait or fail. */
3996 if (cMsWaitThread == 0)
3997 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
3998 cMsElapsed = RTTimeSystemMilliTS() - msTsStartWait;
3999 if (cMsElapsed >= cMsWaitThread)
4000 return VERR_SUPDRV_TSC_DELTA_MEASUREMENT_BUSY;
4001
4002 rc = RTThreadSleep(RT_MIN((RTMSINTERVAL)(cMsWaitThread - cMsElapsed), RT_MIN(iWaitLoop + 1, 10)));
4003 if (rc == VERR_INTERRUPTED)
4004 return rc;
4005 }
4006#endif /* SUPDRV_USE_TSC_DELTA_THREAD */
4007
4008 /*
4009 * Try measure the TSC delta the given number of times.
4010 */
4011 for (;;)
4012 {
4013 /* Unless we're forced to measure the delta, check whether it's done already. */
4014 if ( !(fFlags & SUP_TSCDELTA_MEASURE_F_FORCE)
4015 && pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX)
4016 {
4017 rc = VINF_SUCCESS;
4018 break;
4019 }
4020
4021 /* Measure it. */
4022 rc = supdrvMeasureTscDeltaOne(pDevExt, iGipCpu);
4023 if (rc != VERR_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED)
4024 {
4025 Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
4026 break;
4027 }
4028
4029 /* Retry? */
4030 if (cTries <= 1)
4031 break;
4032 cTries--;
4033
4034 if (cMsWaitRetry)
4035 {
4036 rc = RTThreadSleep(cMsWaitRetry);
4037 if (rc == VERR_INTERRUPTED)
4038 break;
4039 }
4040 }
4041
4042 return rc;
4043}
4044
4045
4046/**
4047 * Service a TSC-delta measurement request.
4048 *
4049 * @returns VBox status code.
4050 * @param pDevExt Pointer to the device instance data.
4051 * @param pSession The support driver session.
4052 * @param pReq Pointer to the TSC-delta measurement request.
4053 */
4054int VBOXCALL supdrvIOCtl_TscDeltaMeasure(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCDELTAMEASURE pReq)
4055{
4056 uint32_t cTries;
4057 uint32_t iCpuSet;
4058 uint32_t fFlags;
4059 RTMSINTERVAL cMsWaitRetry;
4060
4061 /*
4062 * Validate and adjust/resolve the input so they can be passed onto SUPR0TscDeltaMeasureBySetIndex.
4063 */
4064 AssertPtr(pDevExt); AssertPtr(pSession); AssertPtr(pReq); /* paranoia^2 */
4065
4066 if (pReq->u.In.idCpu == NIL_RTCPUID)
4067 return VERR_INVALID_CPU_ID;
4068 iCpuSet = RTMpCpuIdToSetIndex(pReq->u.In.idCpu);
4069 if (iCpuSet >= RTCPUSET_MAX_CPUS)
4070 return VERR_INVALID_CPU_ID;
4071
4072 cTries = pReq->u.In.cRetries == 0 ? 0 : (uint32_t)pReq->u.In.cRetries + 1;
4073
4074 cMsWaitRetry = RT_MAX(pReq->u.In.cMsWaitRetry, 5);
4075
4076 fFlags = 0;
4077 if (pReq->u.In.fAsync)
4078 fFlags |= SUP_TSCDELTA_MEASURE_F_ASYNC;
4079 if (pReq->u.In.fForce)
4080 fFlags |= SUP_TSCDELTA_MEASURE_F_FORCE;
4081
4082 return SUPR0TscDeltaMeasureBySetIndex(pSession, iCpuSet, fFlags, cMsWaitRetry,
4083 cTries == 0 ? 5*RT_MS_1SEC : cMsWaitRetry * cTries /*cMsWaitThread*/,
4084 cTries);
4085}
4086
4087
/**
 * Reads TSC with delta applied.
 *
 * Will try to resolve delta value INT64_MAX before applying it.  This is the
 * main purpose of this function, to handle the case where the delta needs to be
 * determined.
 *
 * @returns VBox status code.
 * @param   pDevExt         Pointer to the device instance data.
 * @param   pSession        The support driver session.
 * @param   pReq            Pointer to the TSC-read request.
 */
int VBOXCALL supdrvIOCtl_TscRead(PSUPDRVDEVEXT pDevExt, PSUPDRVSESSION pSession, PSUPTSCREAD pReq)
{
    PSUPGLOBALINFOPAGE pGip;
    int rc;

    /*
     * Validate.  We require the client to have mapped GIP (no asserting on
     * ring-3 preconditions).
     */
    AssertPtr(pDevExt); AssertPtr(pReq); AssertPtr(pSession); /* paranoia^2 */
    if (pSession->GipMapObjR3 == NIL_RTR0MEMOBJ)
        return VERR_WRONG_ORDER;
    pGip = pDevExt->pGip;
    AssertReturn(pGip, VERR_INTERNAL_ERROR_2);

    /*
     * We're usually here because we need to apply delta, but we shouldn't be
     * upset if the GIP is some different mode.
     */
    if (pGip->enmUseTscDelta > SUPGIPUSETSCDELTA_ZERO_CLAIMED)
    {
        uint32_t cTries = 0;
        for (;;)
        {
            /*
             * Start by gathering the data, using CLI for disabling preemption
             * while we do that.
             */
            RTCCUINTREG uFlags  = ASMIntDisableFlags();
            int         iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
            int         iGipCpu;
            /* Map the current CPU's set index to its GIP CPU entry; both lookups
               must be in range for the entry to be trusted. */
            if (RT_LIKELY(   (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                          && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
            {
                /* Snapshot delta, APIC ID and TSC while preemption is disabled so
                   all three refer to the same CPU. */
                int64_t i64Delta   = pGip->aCPUs[iGipCpu].i64TSCDelta;
                pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
                pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
                ASMSetFlags(uFlags);

                /*
                 * If we're lucky we've got a delta, but no predictions here
                 * as this I/O control is normally only used when the TSC delta
                 * is set to INT64_MAX.
                 */
                if (i64Delta != INT64_MAX)
                {
                    pReq->u.Out.u64AdjustedTsc -= i64Delta;
                    rc = VINF_SUCCESS;
                    break;
                }

                /* Give up after a few times. */
                if (cTries >= 4)
                {
                    rc = VWRN_SUPDRV_TSC_DELTA_MEASUREMENT_FAILED;
                    break;
                }

                /* Need to measure the delta and try again. */
                rc = supdrvMeasureTscDeltaOne(pDevExt, iGipCpu);
                Assert(pGip->aCPUs[iGipCpu].i64TSCDelta != INT64_MAX || RT_FAILURE_NP(rc));
            }
            else
            {
                /* This really shouldn't happen. */
                AssertMsgFailed(("idCpu=%#x iCpuSet=%#x (%d)\n", RTMpCpuId(), iCpuSet, iCpuSet));
                pReq->u.Out.idApic = ASMGetApicId();
                pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
                ASMSetFlags(uFlags);
                rc = VERR_INTERNAL_ERROR_5; /** @todo change to warning. */
                break;
            }
        }
    }
    else
    {
        /*
         * No delta to apply. Easy. Deal with preemption the lazy way.
         */
        RTCCUINTREG uFlags  = ASMIntDisableFlags();
        int         iCpuSet = RTMpCpuIdToSetIndex(RTMpCpuId());
        int         iGipCpu;
        if (RT_LIKELY(   (unsigned)iCpuSet < RT_ELEMENTS(pGip->aiCpuFromCpuSetIdx)
                      && (iGipCpu = pGip->aiCpuFromCpuSetIdx[iCpuSet]) < pGip->cCpus ))
            pReq->u.Out.idApic = pGip->aCPUs[iGipCpu].idApic;
        else
            pReq->u.Out.idApic = ASMGetApicId();
        pReq->u.Out.u64AdjustedTsc = ASMReadTSC();
        ASMSetFlags(uFlags);
        rc = VINF_SUCCESS;
    }

    return rc;
}
4194
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette