Fix for audio drift issues with fragmented mp4 caused by inaccurate timestamps and variable framerate.
sskodje · Jan 19, 2023 · 8fc241d · 8fc241d
1 parent 58ab97e
commit 8fc241d
Show file tree

Hide file tree

Showing 9 changed files with 266 additions and 203 deletions.
diff --git a/ScreenRecorderLibNative/AudioManager.cpp b/ScreenRecorderLibNative/AudioManager.cpp
@@ -72,7 +72,7 @@ HRESULT AudioManager::InitializeAudioCapture()
 	return hr;
 }
 
-std::vector<BYTE> AudioManager::GrabAudioFrame(_In_ int byteCount)
+std::vector<BYTE> AudioManager::GrabAudioFrame(_In_ int durationHundredNanos)
 {
 	EnterCriticalSection(&m_CriticalSection);
 	LeaveCriticalSectionOnExit leaveOnExit(&m_CriticalSection);
@@ -98,8 +98,8 @@ std::vector<BYTE> AudioManager::GrabAudioFrame(_In_ int byteCount)
 			}
 		};
 
-		std::vector<BYTE> outputDeviceData = m_LoopbackCaptureOutputDevice->GetRecordedBytes(byteCount);
-		std::vector<BYTE> inputDeviceData = m_LoopbackCaptureInputDevice->GetRecordedBytes(byteCount);
+		std::vector<BYTE> outputDeviceData = m_LoopbackCaptureOutputDevice->GetRecordedBytes(durationHundredNanos);
+		std::vector<BYTE> inputDeviceData = m_LoopbackCaptureInputDevice->GetRecordedBytes(durationHundredNanos);
 		returnAudioOverflowToBuffer(outputDeviceData, inputDeviceData);
 		if (inputDeviceData.size() > 0 && outputDeviceData.size() && inputDeviceData.size() != outputDeviceData.size()) {
 			LOG_ERROR(L"Mixing audio byte arrays with differing sizes");
@@ -108,9 +108,9 @@ std::vector<BYTE> AudioManager::GrabAudioFrame(_In_ int byteCount)
 		return std::move(MixAudio(outputDeviceData, inputDeviceData, GetAudioOptions()->GetOutputVolume(), GetAudioOptions()->GetInputVolume()));
 	}
 	else if (m_LoopbackCaptureOutputDevice)
-		return std::move(MixAudio(m_LoopbackCaptureOutputDevice->GetRecordedBytes(byteCount), std::vector<BYTE>(), GetAudioOptions()->GetOutputVolume(), 1.0));
+		return std::move(MixAudio(m_LoopbackCaptureOutputDevice->GetRecordedBytes(durationHundredNanos), std::vector<BYTE>(), GetAudioOptions()->GetOutputVolume(), 1.0));
 	else if (m_LoopbackCaptureInputDevice)
-		return std::move(MixAudio(std::vector<BYTE>(), m_LoopbackCaptureInputDevice->GetRecordedBytes(byteCount), 1.0, GetAudioOptions()->GetInputVolume()));
+		return std::move(MixAudio(std::vector<BYTE>(), m_LoopbackCaptureInputDevice->GetRecordedBytes(durationHundredNanos), 1.0, GetAudioOptions()->GetInputVolume()));
 	else
 		return std::vector<BYTE>();
 }

diff --git a/ScreenRecorderLibNative/AudioManager.h b/ScreenRecorderLibNative/AudioManager.h
@@ -9,7 +9,7 @@ class AudioManager
 	~AudioManager();
 	HRESULT Initialize(_In_ std::shared_ptr<AUDIO_OPTIONS> &audioOptions);
 	void ClearRecordedBytes();
-	std::vector<BYTE> GrabAudioFrame(_In_ int byteCount);
+	std::vector<BYTE> GrabAudioFrame(_In_ int durationHundredNanos);
 private:
 	CRITICAL_SECTION m_CriticalSection;
 	std::shared_ptr<AUDIO_OPTIONS> m_AudioOptions;

diff --git a/ScreenRecorderLibNative/LoopbackCapture.cpp b/ScreenRecorderLibNative/LoopbackCapture.cpp
@@ -73,34 +73,34 @@ HRESULT LoopbackCapture::StartLoopbackCapture(
 		// can do this in-place since we're not changing the size of the format
 		// also, the engine will auto-convert from float to int for us
 		switch (pwfx->wFormatTag) {
-		case WAVE_FORMAT_IEEE_FLOAT:
-			pwfx->wFormatTag = WAVE_FORMAT_PCM;
-			pwfx->wBitsPerSample = 16;
-			pwfx->nBlockAlign = pwfx->nChannels * pwfx->wBitsPerSample / 8;
-			pwfx->nAvgBytesPerSec = pwfx->nBlockAlign * pwfx->nSamplesPerSec;
-			break;
-
-		case WAVE_FORMAT_EXTENSIBLE:
-		{
-			// naked scope for case-local variable
-			PWAVEFORMATEXTENSIBLE pEx = reinterpret_cast<PWAVEFORMATEXTENSIBLE>(pwfx);
-			if (IsEqualGUID(KSDATAFORMAT_SUBTYPE_IEEE_FLOAT, pEx->SubFormat)) {
-				pEx->SubFormat = KSDATAFORMAT_SUBTYPE_PCM;
-				pEx->Samples.wValidBitsPerSample = 16;
+			case WAVE_FORMAT_IEEE_FLOAT:
+				pwfx->wFormatTag = WAVE_FORMAT_PCM;
 				pwfx->wBitsPerSample = 16;
 				pwfx->nBlockAlign = pwfx->nChannels * pwfx->wBitsPerSample / 8;
 				pwfx->nAvgBytesPerSec = pwfx->nBlockAlign * pwfx->nSamplesPerSec;
+				break;
+
+			case WAVE_FORMAT_EXTENSIBLE:
+			{
+				// naked scope for case-local variable
+				PWAVEFORMATEXTENSIBLE pEx = reinterpret_cast<PWAVEFORMATEXTENSIBLE>(pwfx);
+				if (IsEqualGUID(KSDATAFORMAT_SUBTYPE_IEEE_FLOAT, pEx->SubFormat)) {
+					pEx->SubFormat = KSDATAFORMAT_SUBTYPE_PCM;
+					pEx->Samples.wValidBitsPerSample = 16;
+					pwfx->wBitsPerSample = 16;
+					pwfx->nBlockAlign = pwfx->nChannels * pwfx->wBitsPerSample / 8;
+					pwfx->nAvgBytesPerSec = pwfx->nBlockAlign * pwfx->nSamplesPerSec;
+				}
+				else {
+					LOG_ERROR(L"%s", L"Don't know how to coerce mix format to int-16");
+					return E_UNEXPECTED;
+				}
 			}
-			else {
-				LOG_ERROR(L"%s", L"Don't know how to coerce mix format to int-16");
-				return E_UNEXPECTED;
-			}
-		}
-		break;
+			break;
 
-		default:
-			LOG_ERROR(L"Don't know how to coerce WAVEFORMATEX with wFormatTag = 0x%08x to int-16", pwfx->wFormatTag);
-			return E_UNEXPECTED;
+			default:
+				LOG_ERROR(L"Don't know how to coerce WAVEFORMATEX with wFormatTag = 0x%08x to int-16", pwfx->wFormatTag);
+				return E_UNEXPECTED;
 		}
 	}
 	UINT32 outputSampleRate;
@@ -166,15 +166,15 @@ HRESULT LoopbackCapture::StartLoopbackCapture(
 	// so we're going to do a timer-driven loop
 	switch (flow)
 	{
-	case eRender:
-		hr = pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, AUDCLNT_STREAMFLAGS_LOOPBACK, audioClientBuffer, 0, pwfx, 0);
-		break;
-	case eCapture:
-		hr = pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, 0, audioClientBuffer, 0, pwfx, 0);
-		break;
-	default:
-		hr = pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, AUDCLNT_STREAMFLAGS_LOOPBACK, audioClientBuffer, 0, pwfx, 0);
-		break;
+		case eRender:
+			hr = pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, AUDCLNT_STREAMFLAGS_LOOPBACK, audioClientBuffer, 0, pwfx, 0);
+			break;
+		case eCapture:
+			hr = pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, 0, audioClientBuffer, 0, pwfx, 0);
+			break;
+		default:
+			hr = pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, AUDCLNT_STREAMFLAGS_LOOPBACK, audioClientBuffer, 0, pwfx, 0);
+			break;
 	}
 	if (FAILED(hr)) {
 		LOG_ERROR(L"IAudioClient::Initialize failed on %ls: hr = 0x%08x", m_Tag.c_str(), hr);
@@ -351,11 +351,11 @@ std::vector<BYTE> LoopbackCapture::PeakRecordedBytes()
 	return m_RecordedBytes;
 }
 
-std::vector<BYTE> LoopbackCapture::GetRecordedBytes(int byteCount)
+std::vector<BYTE> LoopbackCapture::GetRecordedBytes(int duration100Nanos)
 {
-	byteCount = min(byteCount, m_RecordedBytes.size());
+	int frameCount = int(ceil(m_InputFormat.sampleRate * HundredNanosToSeconds(duration100Nanos)));
+	int byteCount = min((frameCount * m_InputFormat.FrameBytes()), m_RecordedBytes.size());
 	m_TaskWrapperImpl->m_Mutex.lock();
-
 	std::vector<BYTE> newvector(m_RecordedBytes.begin(), m_RecordedBytes.begin() + byteCount);
 	// convert audio
 	if (requiresResampling() && byteCount > 0) {
@@ -405,13 +405,13 @@ HRESULT LoopbackCapture::StartCapture(UINT32 sampleRate, UINT32 audioChannels, s
 		auto file = prefs.m_hFile;
 		m_TaskWrapperImpl->m_CaptureTask = concurrency::create_task([this, flow, sampleRate, audioChannels, device, file]() {
 			if (FAILED(StartLoopbackCapture(device,
-				file,
-				true,
-				m_CaptureStartedEvent,
-				m_CaptureStopEvent,
-				flow,
-				sampleRate,
-				audioChannels))) {
+			file,
+			true,
+			m_CaptureStartedEvent,
+			m_CaptureStopEvent,
+			flow,
+			sampleRate,
+			audioChannels))) {
 				SetEvent(m_CaptureStopEvent);
 			}
 			});

diff --git a/ScreenRecorderLibNative/LoopbackCapture.h b/ScreenRecorderLibNative/LoopbackCapture.h
@@ -32,7 +32,7 @@ class LoopbackCapture
 		UINT32 channels
 	);
 	std::vector<BYTE> PeakRecordedBytes();
-	std::vector<BYTE> GetRecordedBytes(int byteCount);
+	std::vector<BYTE> GetRecordedBytes(int duration100Nanos);
 	HRESULT StartCapture(UINT32 audioChannels, std::wstring device, EDataFlow flow) { return StartCapture(0, audioChannels, device, flow); }
 	HRESULT StartCapture(UINT32 sampleRate, UINT32 audioChannels, std::wstring device, EDataFlow flow);
 	HRESULT StopCapture();

diff --git a/ScreenRecorderLibNative/OutputManager.cpp b/ScreenRecorderLibNative/OutputManager.cpp
@@ -11,6 +11,8 @@ using namespace concurrency;
 OutputManager::OutputManager() :
 	m_Device(nullptr),
 	m_DeviceContext(nullptr),
+	m_PresentationClock(nullptr),
+	m_TimeSrc(nullptr),
 	m_CallBack(nullptr),
 	m_FinalizeEvent(nullptr),
 	m_SinkWriter(nullptr),
@@ -66,6 +68,13 @@ HRESULT OutputManager::Initialize(
 	if (m_MediaTransform) {
 		m_MediaTransform->ProcessMessage(MFT_MESSAGE_COMMAND_FLUSH, 0);
 	}
+	if (!m_TimeSrc) {
+		RETURN_ON_BAD_HR(MFCreateSystemTimeSource(&m_TimeSrc));
+	}
+	if (!m_PresentationClock) {
+		RETURN_ON_BAD_HR(MFCreatePresentationClock(&m_PresentationClock));
+		RETURN_ON_BAD_HR(m_PresentationClock->SetTimeSource(m_TimeSrc));
+	}
 	RETURN_ON_BAD_HR(m_DeviceManager->ResetDevice(pDevice, m_ResetToken));
 	return S_OK;
 }
@@ -91,6 +100,8 @@ HRESULT OutputManager::BeginRecording(_In_ std::wstring outputPath, _In_ SIZE vi
 		RETURN_ON_BAD_HR(MFCreateFile(MF_ACCESSMODE_READWRITE, MF_OPENMODE_FAIL_IF_EXIST, MF_FILEFLAGS_NONE, outputPath.c_str(), &mfByteStream));
 		RETURN_ON_BAD_HR(hr = InitializeVideoSinkWriter(mfByteStream, m_Device, inputMediaFrameRect, videoOutputFrameSize, DXGI_MODE_ROTATION_UNSPECIFIED, m_CallBack, &m_SinkWriter, &m_VideoStreamIndex, &m_AudioStreamIndex));
 	}
+	StartMediaClock();
+	LOG_DEBUG("Sink Writer initialized");
 	return hr;
 }
 
@@ -113,6 +124,8 @@ HRESULT OutputManager::BeginRecording(_In_ IStream *pStream, _In_ SIZE videoOutp
 		RECT inputMediaFrameRect = RECT{ 0,0,videoOutputFrameSize.cx,videoOutputFrameSize.cy };
 		RETURN_ON_BAD_HR(hr = InitializeVideoSinkWriter(mfByteStream, m_Device, inputMediaFrameRect, videoOutputFrameSize, DXGI_MODE_ROTATION_UNSPECIFIED, m_CallBack, &m_SinkWriter, &m_VideoStreamIndex, &m_AudioStreamIndex));
 	}
+	StartMediaClock();
+	LOG_DEBUG("Sink Writer initialized");
 	return hr;
 }
 
@@ -159,6 +172,7 @@ HRESULT OutputManager::FinalizeRecording()
 			}
 		}
 	}
+	StopMediaClock();
 	return finalizeResult;
 }
 
@@ -256,23 +270,58 @@ void OutputManager::WriteTextureToImageAsync(_In_ ID3D11Texture2D *pAcquiredDesk
 	   }).then([this, filePath, pAcquiredDesktopImage, onCompletion](concurrency::task<HRESULT> t)
 		   {
 			   HRESULT hr;
-			   try {
-				   hr = t.get();
-				   // if .get() didn't throw and the HRESULT succeeded, there are no errors.
-			   }
-			   catch (const exception &e) {
-				   // handle error
-				   LOG_ERROR(L"Exception saving snapshot: %s", e.what());
-				   hr = E_FAIL;
-			   }
-			   pAcquiredDesktopImage->Release();
-			   if (onCompletion) {
-				   std::invoke(onCompletion, hr);
-			   }
-			   return hr;
+	   try {
+		   hr = t.get();
+		   // if .get() didn't throw and the HRESULT succeeded, there are no errors.
+	   }
+	   catch (const exception &e) {
+		   // handle error
+		   LOG_ERROR(L"Exception saving snapshot: %s", e.what());
+		   hr = E_FAIL;
+	   }
+	   pAcquiredDesktopImage->Release();
+	   if (onCompletion) {
+		   std::invoke(onCompletion, hr);
+	   }
+	   return hr;
 		   });
 }
 
+// Starts the presentation clock at position 0.
+// Returns E_POINTER if the presentation clock has not been created yet,
+// otherwise the HRESULT reported by IMFPresentationClock::Start.
+HRESULT OutputManager::StartMediaClock()
+{
+	// The clock is only created during initialization; guard against a null dereference
+	// when this is reached without a successfully created clock.
+	if (!m_PresentationClock) {
+		return E_POINTER;
+	}
+	return m_PresentationClock->Start(0);
+}
+// Restarts the presentation clock from its current position (PRESENTATION_CURRENT_POSITION).
+// Returns E_POINTER if the presentation clock has not been created yet,
+// otherwise the HRESULT reported by IMFPresentationClock::Start.
+HRESULT OutputManager::ResumeMediaClock()
+{
+	// Guard against a null dereference when no clock has been created.
+	if (!m_PresentationClock) {
+		return E_POINTER;
+	}
+	return m_PresentationClock->Start(PRESENTATION_CURRENT_POSITION);
+}
+// Pauses the presentation clock.
+// Returns E_POINTER if the presentation clock has not been created yet,
+// otherwise the HRESULT reported by IMFPresentationClock::Pause.
+HRESULT OutputManager::PauseMediaClock()
+{
+	// Guard against a null dereference when no clock has been created.
+	if (!m_PresentationClock) {
+		return E_POINTER;
+	}
+	return m_PresentationClock->Pause();
+}
+// Stops the presentation clock.
+// Returns E_POINTER if the presentation clock has not been created yet,
+// otherwise the HRESULT reported by IMFPresentationClock::Stop.
+HRESULT OutputManager::StopMediaClock()
+{
+	// Stopping may be requested during teardown even if setup never got as far as
+	// creating the clock, so a missing clock is reported instead of dereferenced.
+	if (!m_PresentationClock) {
+		return E_POINTER;
+	}
+	return m_PresentationClock->Stop();
+}
+
+// Reports whether the presentation clock is in the running state.
+// Returns false when the clock does not exist or its state cannot be queried.
+bool OutputManager::isMediaClockRunning()
+{
+	if (!m_PresentationClock) {
+		return false;
+	}
+	// Initialized so that a failed GetState() can never leave an indeterminate value
+	// to be compared below.
+	MFCLOCK_STATE state = MFCLOCK_STATE_INVALID;
+	if (FAILED(m_PresentationClock->GetState(0, &state))) {
+		return false;
+	}
+	return state == MFCLOCK_STATE_RUNNING;
+}
+
+// Reports whether the presentation clock is in the paused state.
+// Returns false when the clock does not exist or its state cannot be queried.
+bool OutputManager::isMediaClockPaused()
+{
+	if (!m_PresentationClock) {
+		return false;
+	}
+	// Initialized so that a failed GetState() can never leave an indeterminate value
+	// to be compared below.
+	MFCLOCK_STATE state = MFCLOCK_STATE_INVALID;
+	if (FAILED(m_PresentationClock->GetState(0, &state))) {
+		return false;
+	}
+	return state == MFCLOCK_STATE_PAUSED;
+}
+
+// Retrieves the current time of the presentation clock into *pTime.
+// Returns E_POINTER if pTime is null or the presentation clock has not been created
+// (in the latter case *pTime is set to 0); otherwise the HRESULT reported by
+// IMFPresentationClock::GetTime.
+HRESULT OutputManager::GetMediaTimeStamp(_Out_ INT64 *pTime)
+{
+	if (!pTime) {
+		return E_POINTER;
+	}
+	if (!m_PresentationClock) {
+		// Give the caller a defined value even though the call failed.
+		*pTime = 0;
+		return E_POINTER;
+	}
+	return m_PresentationClock->GetTime(pTime);
+}
 
 HRESULT OutputManager::ConfigureOutputMediaTypes(
 	_In_ UINT destWidth,
@@ -338,7 +387,7 @@ HRESULT OutputManager::ConfigureInputMediaTypes(
 	RETURN_ON_BAD_HR(pVideoMediaType->SetUINT32(MF_MT_VIDEO_CHROMA_SITING, MFVideoChromaSubsampling_ProgressiveChroma));
 	RETURN_ON_BAD_HR(pVideoMediaType->SetUINT32(MF_MT_VIDEO_NOMINAL_RANGE, MFNominalRange_0_255));
 	RETURN_ON_BAD_HR(MFSetAttributeSize(pVideoMediaType, MF_MT_FRAME_SIZE, sourceWidth, sourceHeight));
-	if (!GetEncoderOptions()->GetIsFixedFramerate()) {
+	if (!GetEncoderOptions()->GetIsFixedFramerate() && !GetEncoderOptions()->GetIsFragmentedMp4Enabled()) {
 		RETURN_ON_BAD_HR(MFSetAttributeRatio(pVideoMediaType, MF_MT_FRAME_RATE, GetEncoderOptions()->GetVideoFps(), 1));
 	}
 	RETURN_ON_BAD_HR(MFSetAttributeRatio(pVideoMediaType, MF_MT_PIXEL_ASPECT_RATIO, 1, 1));
@@ -464,9 +513,9 @@ HRESULT OutputManager::InitializeVideoSinkWriter(
 	auto SetAttributeU32([](_Inout_ CComPtr<ICodecAPI> &codec, _In_ const GUID &guid, _In_ UINT32 value)
 	{
 		VARIANT val;
-		val.vt = VT_UI4;
-		val.uintVal = value;
-		return codec->SetValue(&guid, &val);
+	val.vt = VT_UI4;
+	val.uintVal = value;
+	return codec->SetValue(&guid, &val);
 	});
 
 	CComPtr<ICodecAPI> encoder = nullptr;
@@ -594,7 +643,7 @@ HRESULT OutputManager::WriteFrameToVideo(_In_ INT64 frameStartPos, _In_ INT64 fr
 	SafeRelease(&p2DBuffer);
 	SafeRelease(&pMediaBuffer);
 	return hr;
-	}
+}
 
 HRESULT OutputManager::WriteAudioSamplesToVideo(_In_ INT64 frameStartPos, _In_ INT64 frameDuration, _In_ DWORD streamIndex, _In_ BYTE *pSrc, _In_ DWORD cbData)
 {

diff --git a/ScreenRecorderLibNative/OutputManager.h b/ScreenRecorderLibNative/OutputManager.h
@@ -40,10 +40,20 @@ class OutputManager
 	void WriteTextureToImageAsync(_In_ ID3D11Texture2D *pAcquiredDesktopImage, _In_ std::wstring filePath, _In_opt_ std::function<void(HRESULT)> onCompletion = nullptr);
 	inline nlohmann::fifo_map<std::wstring, int> GetFrameDelays() { return m_FrameDelays; }
 	inline UINT64 GetRenderedFrameCount() { return m_RenderedFrameCount; }
+	HRESULT StartMediaClock();
+	HRESULT ResumeMediaClock();
+	HRESULT PauseMediaClock();
+	HRESULT StopMediaClock();
+	HRESULT GetMediaTimeStamp(_Out_ INT64 *pTime);
+	bool isMediaClockRunning();
+	bool isMediaClockPaused();
 private:
 	ID3D11DeviceContext *m_DeviceContext = nullptr;
 	ID3D11Device *m_Device = nullptr;
 
+	CComPtr<IMFPresentationTimeSource> m_TimeSrc;
+	CComPtr<IMFPresentationClock> m_PresentationClock;
+
 	std::shared_ptr<ENCODER_OPTIONS> m_EncoderOptions;
 	std::shared_ptr<AUDIO_OPTIONS> m_AudioOptions;
 	std::shared_ptr<SNAPSHOT_OPTIONS> m_SnapshotOptions;