diff --git a/include/dpp/discordvoiceclient.h b/include/dpp/discordvoiceclient.h index 07e4364483..8ed58107d8 100644 --- a/include/dpp/discordvoiceclient.h +++ b/include/dpp/discordvoiceclient.h @@ -346,6 +346,17 @@ class DPP_EXPORT discord_voice_client : public websocket_client */ uint32_t timestamp; + /** + * @brief Each packet should have a nonce, a 32-bit incremental + * integer value appended to payload. + * + * We should keep track of this value and increment it for each + * packet sent. + * + * Current initial value is hardcoded to 1. + */ + uint32_t packet_nonce; + /** * @brief Last sent packet high-resolution timestamp */ diff --git a/src/dpp/discordvoiceclient.cpp b/src/dpp/discordvoiceclient.cpp index c28ce709b6..e46d4258a0 100644 --- a/src/dpp/discordvoiceclient.cpp +++ b/src/dpp/discordvoiceclient.cpp @@ -20,6 +20,7 @@ * ************************************************************************************/ +#include #include #ifdef _WIN32 #include @@ -316,6 +317,7 @@ discord_voice_client::discord_voice_client(dpp::cluster* _cluster, snowflake _ch secret_key(nullptr), sequence(0), timestamp(0), + packet_nonce(1), last_timestamp(std::chrono::high_resolution_clock::now()), sending(false), tracks(0), @@ -593,6 +595,9 @@ bool discord_voice_client::handle_frame(const std::string &data) rdy.voice_channel_id = this->channel_id; creator->on_voice_ready.call(rdy); } + + /* Reset packet_nonce */ + packet_nonce = 1; } break; /* Voice ready */ @@ -711,131 +716,157 @@ void discord_voice_client::read_ready() uint8_t buffer[65535]; int packet_size = this->udp_recv((char*)buffer, sizeof(buffer)); - if (packet_size > 0 && (!creator->on_voice_receive.empty() || !creator->on_voice_receive_combined.empty())) { - constexpr size_t header_size = 12; - if (static_cast(packet_size) < header_size) { - /* Invalid RTP payload */ - return; - } + bool receive_handler_is_empty = creator->on_voice_receive.empty() && creator->on_voice_receive_combined.empty(); + if (packet_size <= 0 || receive_handler_is_empty) { + /* Nothing to do */ + return; + } - /* It's a "silence packet" - throw it away. */ - if (packet_size < 44) { - return; - } + constexpr size_t header_size = 12; + if (static_cast(packet_size) < header_size) { + /* Invalid RTP payload */ + return; + } - if (uint8_t payload_type = buffer[1] & 0b0111'1111; - 72 <= payload_type && payload_type <= 76) { - /* - * This is an RTCP payload. Discord is known to send - * RTCP Receiver Reports. - * - * See https://datatracker.ietf.org/doc/html/rfc3551#section-6 - */ - return; - } + /* It's a "silence packet" - throw it away. */ + if (packet_size < 44) { + return; + } - voice_payload vp{0, // seq, populate later - 0, // timestamp, populate later - std::make_unique(nullptr, std::string((char*)buffer, packet_size))}; + if (uint8_t payload_type = buffer[1] & 0b0111'1111; + 72 <= payload_type && payload_type <= 76) { + /* + * This is an RTCP payload. Discord is known to send + * RTCP Receiver Reports. + * + * See https://datatracker.ietf.org/doc/html/rfc3551#section-6 + */ + return; + } - vp.vr->voice_client = this; + voice_payload vp{0, // seq, populate later + 0, // timestamp, populate later + std::make_unique(nullptr, std::string((char*)buffer, packet_size))}; - { /* Get the User ID of the speaker */ - uint32_t speaker_ssrc; - std::memcpy(&speaker_ssrc, &buffer[8], sizeof(uint32_t)); - speaker_ssrc = ntohl(speaker_ssrc); - vp.vr->user_id = ssrc_map[speaker_ssrc]; - } + vp.vr->voice_client = this; - /* Get the sequence number of the voice UDP packet */ - std::memcpy(&vp.seq, &buffer[2], sizeof(rtp_seq_t)); - vp.seq = ntohs(vp.seq); - /* Get the timestamp of the voice UDP packet */ - std::memcpy(&vp.timestamp, &buffer[4], sizeof(rtp_timestamp_t)); - vp.timestamp = ntohl(vp.timestamp); - - /* Nonce is the RTP Header with zero padding */ - uint8_t nonce[24] = { 0 }; - std::memcpy(nonce, &buffer[0], header_size); - - /* Get the number of CSRC in header */ - const size_t csrc_count = buffer[0] & 0b0000'1111; - /* Skip to the encrypted voice data */ - const ptrdiff_t offset_to_data = header_size + sizeof(uint32_t) * csrc_count; - uint8_t* encrypted_data = buffer + offset_to_data; - const size_t encrypted_data_len = packet_size - offset_to_data; - - if(crypto_aead_xchacha20poly1305_ietf_decrypt() != 0) - - if (crypto_secretbox_open_easy(encrypted_data, encrypted_data, - encrypted_data_len, nonce, secret_key)) { - /* Invalid Discord RTP payload. */ - return; - } + uint32_t speaker_ssrc; + { /* Get the User ID of the speaker */ + std::memcpy(&speaker_ssrc, &buffer[8], sizeof(uint32_t)); + speaker_ssrc = ntohl(speaker_ssrc); + vp.vr->user_id = ssrc_map[speaker_ssrc]; + } - const uint8_t* decrypted_data = encrypted_data; - size_t decrypted_data_len = encrypted_data_len - crypto_box_MACBYTES; - if ([[maybe_unused]] const bool uses_extension = (buffer[0] >> 4) & 0b0001) { - /* Skip the RTP Extensions */ - size_t ext_len = 0; - { - uint16_t ext_len_in_words; - memcpy(&ext_len_in_words, &decrypted_data[2], sizeof(uint16_t)); - ext_len_in_words = ntohs(ext_len_in_words); - ext_len = sizeof(uint32_t) * ext_len_in_words; - } - constexpr size_t ext_header_len = sizeof(uint16_t) * 2; - decrypted_data += ext_header_len + ext_len; - decrypted_data_len -= ext_header_len + ext_len; + /* Get the sequence number of the voice UDP packet */ + std::memcpy(&vp.seq, &buffer[2], sizeof(rtp_seq_t)); + vp.seq = ntohs(vp.seq); + /* Get the timestamp of the voice UDP packet */ + std::memcpy(&vp.timestamp, &buffer[4], sizeof(rtp_timestamp_t)); + vp.timestamp = ntohl(vp.timestamp); + + constexpr size_t nonce_size = sizeof(uint32_t); + /* Nonce is 4 byte at the end of payload with zero padding */ + uint8_t nonce[24] = { 0 }; + std::memcpy(nonce, buffer + packet_size - nonce_size, nonce_size); + + /* Get the number of CSRC in header */ + const size_t csrc_count = buffer[0] & 0b0000'1111; + /* Skip to the encrypted voice data */ + const ptrdiff_t offset_to_data = header_size + sizeof(uint32_t) * csrc_count; + size_t total_header_len = offset_to_data; + + uint8_t* ciphertext = buffer + offset_to_data; + size_t ciphertext_len = packet_size - offset_to_data - nonce_size; + + size_t ext_len = 0; + if ([[maybe_unused]] const bool uses_extension = (buffer[0] >> 4) & 0b0001) { + /** + * Get the RTP Extensions size, we only get the size here because + * the extension itself is encrypted along with the opus packet + */ + { + uint16_t ext_len_in_words; + memcpy(&ext_len_in_words, &ciphertext[2], sizeof(uint16_t)); + ext_len_in_words = ntohs(ext_len_in_words); + ext_len = sizeof(uint32_t) * ext_len_in_words; } + constexpr size_t ext_header_len = sizeof(uint16_t) * 2; + ciphertext += ext_header_len; + ciphertext_len -= ext_header_len; + total_header_len += ext_header_len; + } - /* - * We're left with the decrypted, opus-encoded data. - * Park the payload and decode on the voice courier thread. + uint8_t decrypted[65535] = { 0 }; + unsigned long long opus_packet_len = 0; + if (crypto_aead_xchacha20poly1305_ietf_decrypt( + decrypted, &opus_packet_len, + nullptr, + ciphertext, ciphertext_len, + buffer, + /** + * Additional Data: + * The whole header (including csrc list) + + * 4 byte extension header (magic 0xBEDE + 16-bit denoting extension length) */ - vp.vr->audio_data.assign(decrypted_data, decrypted_data + decrypted_data_len); + total_header_len, + nonce, secret_key) != 0) { + /* Invalid Discord RTP payload. */ + return; + } - { - std::lock_guard lk(voice_courier_shared_state.mtx); - auto& [range, payload_queue, pending_decoder_ctls, decoder] = voice_courier_shared_state.parked_voice_payloads[vp.vr->user_id]; + uint8_t *opus_packet = decrypted; + if (ext_len > 0) { + /* Skip previously encrypted RTP Header Extension */ + opus_packet += ext_len; + opus_packet_len -= ext_len; + } - if (!decoder) { - /* - * Most likely this is the first time we encounter this speaker. - * Do some initialization for not only the decoder but also the range. + /* + * We're left with the decrypted, opus-encoded data. + * Park the payload and decode on the voice courier thread. + */ + vp.vr->audio_data.assign(opus_packet, opus_packet + opus_packet_len); + + { + std::lock_guard lk(voice_courier_shared_state.mtx); + auto& [range, payload_queue, pending_decoder_ctls, decoder] = voice_courier_shared_state.parked_voice_payloads[vp.vr->user_id]; + + if (!decoder) { + /* + * Most likely this is the first time we encounter this speaker. + * Do some initialization for not only the decoder but also the range. + */ + range.min_seq = vp.seq; + range.min_timestamp = vp.timestamp; + + int opus_error = 0; + decoder.reset(opus_decoder_create(opus_sample_rate_hz, opus_channel_count, &opus_error), + &opus_decoder_destroy); + if (opus_error) { + /** + * NOTE: The -10 here makes the opus_error match up with values of exception_error_code, + * which would otherwise conflict as every C library loves to use values from -1 downwards. */ - range.min_seq = vp.seq; - range.min_timestamp = vp.timestamp; - - int opus_error = 0; - decoder.reset(opus_decoder_create(opus_sample_rate_hz, opus_channel_count, &opus_error), - &opus_decoder_destroy); - if (opus_error) { - /** - * NOTE: The -10 here makes the opus_error match up with values of exception_error_code, - * which would otherwise conflict as every C library loves to use values from -1 downwards. - */ - throw dpp::voice_exception((exception_error_code)(opus_error - 10), "discord_voice_client::discord_voice_client; opus_decoder_create() failed"); - } + throw dpp::voice_exception((exception_error_code)(opus_error - 10), "discord_voice_client::discord_voice_client; opus_decoder_create() failed"); } + } - if (vp.seq < range.min_seq && vp.timestamp < range.min_timestamp) { - /* This packet arrived too late. We can only discard it. */ - return; - } - range.max_seq = vp.seq; - range.max_timestamp = vp.timestamp; - payload_queue.push(std::move(vp)); + if (vp.seq < range.min_seq && vp.timestamp < range.min_timestamp) { + /* This packet arrived too late. We can only discard it. */ + return; } + range.max_seq = vp.seq; + range.max_timestamp = vp.timestamp; + payload_queue.push(std::move(vp)); + } - voice_courier_shared_state.signal_iteration.notify_one(); + voice_courier_shared_state.signal_iteration.notify_one(); - if (!voice_courier.joinable()) { - /* Courier thread is not running, start it */ - voice_courier = std::thread(&voice_courier_loop, - std::ref(*this), - std::ref(voice_courier_shared_state)); - } + if (!voice_courier.joinable()) { + /* Courier thread is not running, start it */ + voice_courier = std::thread(&voice_courier_loop, + std::ref(*this), + std::ref(voice_courier_shared_state)); } #else throw dpp::voice_exception(err_no_voice_support, "Voice support not enabled in this build of D++"); @@ -1244,13 +1275,13 @@ discord_voice_client& discord_voice_client::send_audio_raw(uint16_t* audio_data, return send_audio_raw((uint16_t*)packet.data(), packet.size()); } - opus_int32 encodedAudioMaxLength = (opus_int32)length; - std::vector encodedAudioData(encodedAudioMaxLength); - size_t encodedAudioLength = encodedAudioMaxLength; + opus_int32 encoded_audio_max_length = (opus_int32)length; + std::vector encoded_audio(encoded_audio_max_length); + size_t encoded_audio_length = encoded_audio_max_length; - encodedAudioLength = this->encode((uint8_t*)audio_data, length, encodedAudioData.data(), encodedAudioLength); + encoded_audio_length = this->encode((uint8_t*)audio_data, length, encoded_audio.data(), encoded_audio_length); - send_audio_opus(encodedAudioData.data(), encodedAudioLength); + send_audio_opus(encoded_audio.data(), encoded_audio_length); #else throw dpp::voice_exception(err_no_voice_support, "Voice support not enabled in this build of D++"); #endif @@ -1270,31 +1301,54 @@ discord_voice_client& discord_voice_client::send_audio_opus(uint8_t* opus_packet discord_voice_client& discord_voice_client::send_audio_opus(uint8_t* opus_packet, const size_t length, uint64_t duration) { #if HAVE_VOICE - int frameSize = (int)(48 * duration * (timescale / 1000000)); - opus_int32 encodedAudioMaxLength = (opus_int32)length; - std::vector encodedAudioData(encodedAudioMaxLength); - size_t encodedAudioLength = encodedAudioMaxLength; + int frame_size = (int)(48 * duration * (timescale / 1000000)); + opus_int32 encoded_audio_max_length = (opus_int32)length; + std::vector encoded_audio(encoded_audio_max_length); + size_t encoded_audio_length = encoded_audio_max_length; - encodedAudioLength = length; - encodedAudioData.reserve(length); - memcpy(encodedAudioData.data(), opus_packet, length); + encoded_audio_length = length; + encoded_audio.reserve(length); + memcpy(encoded_audio.data(), opus_packet, length); ++sequence; rtp_header header(sequence, timestamp, (uint32_t)ssrc); - std::vector audioDataPacket(sizeof(header) + encodedAudioLength + crypto_secretbox_MACBYTES); - std::memcpy(audioDataPacket.data(), &header, sizeof(header)); + /* Expected payload size is unencrypted header + encrypted opus packet + unencrypted 32 bit nonce */ + size_t packet_siz = sizeof(header) + (encoded_audio_length + crypto_aead_xchacha20poly1305_IETF_ABYTES) + sizeof(packet_nonce); + + std::vector payload(packet_siz); + + /* Set RTP header */ + std::memcpy(payload.data(), &header, sizeof(header)); + + /* Convert nonce to big-endian */ + uint32_t noncel = htonl(packet_nonce); + + /* 24 byte is needed for encrypting, discord just want 4 byte so just fill up the rest with null */ + unsigned char encrypt_nonce[crypto_aead_xchacha20poly1305_ietf_NPUBBYTES] = { '\0' }; + memcpy(encrypt_nonce, &noncel, sizeof(noncel)); - unsigned char nonce[crypto_aead_xchacha20poly1305_ietf_NPUBBYTES]; - randombytes_buf(nonce, sizeof nonce); + /* Execute */ + crypto_aead_xchacha20poly1305_ietf_encrypt( + payload.data() + sizeof(header), + nullptr, + encoded_audio.data(), + encoded_audio_length, + /* The RTP Header as Additional Data */ + reinterpret_cast(&header), + sizeof(header), + nullptr, + static_cast(encrypt_nonce), + secret_key); - unsigned long long clen_p; - crypto_aead_xchacha20poly1305_ietf_encrypt(audioDataPacket.data() + sizeof(header), &clen_p, encodedAudioData.data(), encodedAudioLength, NULL, NULL, NULL, (const unsigned char*)nonce, secret_key); + /* Append the 4 byte nonce to the resulting payload */ + std::memcpy(payload.data() + payload.size() - sizeof(noncel), &noncel, sizeof(noncel)); - //crypto_secretbox_easy(audioDataPacket.data() + sizeof(header), encodedAudioData.data(), encodedAudioLength, (const unsigned char*)nonce, secret_key); + this->send(reinterpret_cast(payload.data()), payload.size(), duration); + timestamp += frame_size; - this->send((const char*)audioDataPacket.data(), audioDataPacket.size(), duration); - timestamp += frameSize; + /* Increment for next packet */ + packet_nonce++; speak(); #else