obs-studio/plugins/obs-outputs/mp4-mux.c

2848 lines
72 KiB
C

/******************************************************************************
Copyright (C) 2024 by Dennis Sädtler <dennis@obsproject.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
#include "mp4-mux-internal.h"
#include "rtmp-hevc.h"
#include "rtmp-av1.h"
#include <obs-avc.h>
#include <obs-hevc.h>
#include <obs-module.h>
#include <util/dstr.h>
#include <util/platform.h>
#include <util/array-serializer.h>
#include <time.h>
/*
* (Mostly) compliant MP4 muxer for fun and profit.
* Based on ISO/IEC 14496-12 and FFmpeg's libavformat/movenc.c ([L]GPL)
*
* Specification section numbers are noted where applicable.
* Standard identifier is included if not referring to ISO/IEC 14496-12.
*/
/* Logging helpers. Every message is prefixed with the muxer tag and the name
 * of the output it belongs to. The expansion reads `mux->output`, so a
 * variable named `mux` must be in scope wherever these macros are used. */
#define do_log(level, format, ...) \
blog(level, "[mp4 muxer: '%s'] " format, \
obs_output_get_name(mux->output), ##__VA_ARGS__)
#define warn(format, ...) do_log(LOG_WARNING, format, ##__VA_ARGS__)
#define info(format, ...) do_log(LOG_INFO, format, ##__VA_ARGS__)
/*
 * Patches the 32-bit size placeholder of a box that has just been written.
 *
 * `start` is the serializer position at which the box header begins. The box
 * length is taken as the distance from `start` to the current position, the
 * placeholder at `start` is overwritten with it (as a big-endian u32) and the
 * write position is restored to where it was on entry.
 *
 * Returns the box length in bytes.
 */
static inline size_t write_box_size(struct serializer *s, int64_t start)
{
	const int64_t box_end = serializer_get_pos(s);
	const size_t box_len = (size_t)(box_end - start);

	/* Jump back to the header, patch the size, then resume at the end */
	serializer_seek(s, start, SERIALIZE_SEEK_START);
	s_wb32(s, (uint32_t)box_len);
	serializer_seek(s, box_end, SERIALIZE_SEEK_START);

	return box_len;
}
/// 4.2 Box header with size and char[4] name
/*
 * Writes a box header. Sizes that fit in 32 bits use the compact form
 * (u32 size + type); larger sizes write the marker value 1 in the size field
 * and append the real size as a u64 "largesize" after the type.
 */
static inline void write_box(struct serializer *s, const size_t size,
			     const char name[4])
{
	const bool needs_largesize = size > UINT32_MAX;

	s_wb32(s, needs_largesize ? 1 : (uint32_t)size); // size (1 = see largesize)
	s_write(s, name, 4);                             // boxtype

	if (needs_largesize)
		s_wb64(s, size); // largesize
}
/// 4.2 FullBox extended header with u8 version and u24 flags
/*
 * Writes a FullBox header: the regular box header (see write_box) followed by
 * a one-byte version and a three-byte flags field.
 */
static inline void write_fullbox(struct serializer *s, const size_t size,
				 const char name[4], uint8_t version,
				 uint32_t flags)
{
	/* Common box header first... */
	write_box(s, size, name);

	/* ...then the FullBox extension. */
	s_w8(s, version);  // version
	s_wb24(s, flags);  // flags
}
/// 4.3 File Type Box
static size_t mp4_write_ftyp(struct mp4_mux *mux, bool fragmented)
{
struct serializer *s = mux->serializer;
int64_t start = serializer_get_pos(s);
write_box(s, 0, "ftyp");
const char *major_brand = "isom";
/* Following FFmpeg's example, when using negative CTS the major brand
* needs to be either iso4 or iso6 depending on whether the file is
* currently fragmented. */
if (mux->flags & MP4_USE_NEGATIVE_CTS)
major_brand = fragmented ? "iso6" : "iso4";
s_write(s, major_brand, 4); // major brand
s_wb32(s, 512); // minor version
// minor brands (first one matches major brand)
s_write(s, major_brand, 4);
/* Write isom base brand if it's not the major brand */
if (strcmp(major_brand, "isom") != 0)
s_write(s, "isom", 4);
/* Avoid adding newer brand (iso6) unless necessary, use "obs1" brand
* as a placeholder to maintain ftyp box size. */
if (fragmented && strcmp(major_brand, "iso6") != 0)
s_write(s, "iso6", 4);
else
s_write(s, "obs1", 4);
s_write(s, "iso2", 4);
/* Include H.264 brand if used */
for (size_t i = 0; i < mux->tracks.num; i++) {
struct mp4_track *track = &mux->tracks.array[i];
if (track->type == TRACK_VIDEO) {
if (track->codec == CODEC_H264)
s_write(s, "avc1", 4);
break;
}
}
/* General MP4 brannd */
s_write(s, "mp41", 4);
return write_box_size(s, start);
}
/// 8.1.2 Free Space Box
/*
 * Writes a 16-byte "free" box and returns its size.
 *
 * Sixteen bytes is exactly the size of a box header with a 64-bit size
 * (u32 + char[4] + u64), so this box can later be replaced by one.
 */
static size_t mp4_write_free(struct mp4_mux *mux)
{
	enum { FREE_BOX_SIZE = 16 };
	struct serializer *s = mux->serializer;

	s_wb32(s, FREE_BOX_SIZE); // size
	s_write(s, "free", 4);    // boxtype
	s_wb64(s, 0);             // 8 bytes of padding

	return FREE_BOX_SIZE;
}
/// 8.2.2 Movie Header Box
/*
 * Writes the "mvhd" box and returns its size in bytes.
 *
 * The movie timescale is fixed at 1000. The movie duration is derived from
 * the first video track (0 if there is none). The box version is 1 (64-bit
 * creation/modification time and duration) when the duration or creation
 * time does not fit in 32 bits, and 0 (32-bit fields) otherwise; the version
 * field must match the field widths or readers will misparse the box.
 */
static size_t mp4_write_mvhd(struct mp4_mux *mux)
{
	struct serializer *s = mux->serializer;
	int64_t start = serializer_get_pos(s);

	/* Use primary video track as the baseline for duration */
	uint64_t duration = 0;
	for (size_t i = 0; i < mux->tracks.num; i++) {
		struct mp4_track *track = &mux->tracks.array[i];
		if (track->type == TRACK_VIDEO) {
			duration = util_mul_div64(track->duration, 1000,
						  track->timebase_den);
			break;
		}
	}

	const bool use_64bit = duration > UINT32_MAX ||
			       mux->creation_time > UINT32_MAX;

	write_fullbox(s, 0, "mvhd", use_64bit ? 1 : 0, 0);

	if (use_64bit) {
		s_wb64(s, mux->creation_time); // creation time
		s_wb64(s, mux->creation_time); // modification time
		s_wb32(s, 1000);               // timescale
		s_wb64(s, duration);           // duration (0 for fragmented)
	} else {
		s_wb32(s, (uint32_t)mux->creation_time); // creation time
		s_wb32(s, (uint32_t)mux->creation_time); // modification time
		s_wb32(s, 1000);                         // timescale
		s_wb32(s, (uint32_t)duration); // duration (0 for fragmented)
	}

	s_wb32(s, 0x00010000); // rate, 16.16 fixed float (1 << 16)
	s_wb16(s, 0x0100);     // volume
	s_wb16(s, 0);          // reserved
	s_wb32(s, 0);          // reserved
	s_wb32(s, 0);          // reserved

	// Matrix
	for (int i = 0; i < 9; i++)
		s_wb32(s, UNITY_MATRIX[i]);

	// pre_defined (6 x u32)
	for (int i = 0; i < 6; i++)
		s_wb32(s, 0);

	s_wb32(s, mux->track_ctr + 1); // next_track_ID

	return write_box_size(s, start);
}
/// 8.3.2 Track Header Box
/*
 * Writes the "tkhd" box for `track` and returns its size in bytes.
 *
 * The duration is expressed in the movie timescale (1000). The box version
 * is 1 (64-bit time and duration fields) when the duration or creation time
 * does not fit in 32 bits and 0 otherwise, so the version always matches the
 * field widths actually written.
 */
static size_t mp4_write_tkhd(struct mp4_mux *mux, struct mp4_track *track)
{
	struct serializer *s = mux->serializer;
	int64_t start = serializer_get_pos(s);

	uint64_t duration =
		util_mul_div64(track->duration, 1000, track->timebase_den);

	/* Flags are 0x1 (enabled) | 0x2 (in movie) */
	static const uint32_t flags = 0x1 | 0x2;

	const bool use_64bit = duration > UINT32_MAX ||
			       mux->creation_time > UINT32_MAX;
	const bool is_audio = track->type == TRACK_AUDIO;

	write_fullbox(s, 0, "tkhd", use_64bit ? 1 : 0, flags);

	if (use_64bit) {
		s_wb64(s, mux->creation_time); // creation time
		s_wb64(s, mux->creation_time); // modification time
		s_wb32(s, track->track_id);    // track_id
		s_wb32(s, 0);                  // reserved
		s_wb64(s, duration);           // duration in movie timescale
	} else {
		s_wb32(s, (uint32_t)mux->creation_time); // creation time
		s_wb32(s, (uint32_t)mux->creation_time); // modification time
		s_wb32(s, track->track_id);              // track_id
		s_wb32(s, 0);                            // reserved
		s_wb32(s, (uint32_t)duration); // duration in movie timescale
	}

	s_wb32(s, 0);                    // reserved
	s_wb32(s, 0);                    // reserved
	s_wb16(s, 0);                    // layer
	s_wb16(s, is_audio ? 1 : 0);     // alternate group
	s_wb16(s, is_audio ? 0x100 : 0); // volume
	s_wb16(s, 0);                    // reserved

	// Matrix (predefined)
	for (int i = 0; i < 9; i++)
		s_wb32(s, UNITY_MATRIX[i]);

	if (is_audio) {
		s_wb32(s, 0); // width
		s_wb32(s, 0); // height
	} else {
		/* width/height are fixed point 16.16, so we just shift the
		 * integer to the upper 16 bits */
		uint32_t width = obs_encoder_get_width(track->encoder);
		s_wb32(s, width << 16);
		uint32_t height = obs_encoder_get_height(track->encoder);
		s_wb32(s, height << 16);
	}

	return write_box_size(s, start);
}
/// 8.4.2 Media Header Box
/*
 * Writes the "mdhd" box for `track` and returns its size in bytes
 * (32 for version 0, 44 for version 1).
 *
 * For video tracks the duration is converted from the track timebase to the
 * track timescale; for other track types it is written as stored. Version 1
 * (64-bit fields) is used when the duration or creation time does not fit in
 * 32 bits, and in that case the full 64-bit duration is written.
 */
static size_t mp4_write_mdhd(struct mp4_mux *mux, struct mp4_track *track)
{
	struct serializer *s = mux->serializer;

	size_t size = 32;
	uint8_t version = 0;
	uint64_t duration = track->duration;
	uint32_t timescale = track->timescale;

	if (track->type == TRACK_VIDEO) {
		/* Update to track timescale */
		duration = util_mul_div64(duration, track->timescale,
					  track->timebase_den);
	}

	/* use 64-bit duration if necessary */
	if (duration > UINT32_MAX || mux->creation_time > UINT32_MAX) {
		size = 44;
		version = 1;
	}

	write_fullbox(s, size, "mdhd", version, 0);

	if (version == 1) {
		s_wb64(s, mux->creation_time); // creation time
		s_wb64(s, mux->creation_time); // modification time
		s_wb32(s, timescale);          // timescale
		s_wb64(s, duration);           // duration (full 64 bits)
	} else {
		s_wb32(s, (uint32_t)mux->creation_time); // creation time
		s_wb32(s, (uint32_t)mux->creation_time); // modification time
		s_wb32(s, timescale);                    // timescale
		s_wb32(s, (uint32_t)duration);           // duration
	}

	s_wb16(s, 21956); // language (undefined)
	s_wb16(s, 0);     // pre_defined

	return size;
}
/// 8.4.3 Handler Reference Box
/*
 * Writes the "hdlr" box for `track` and returns its size in bytes.
 *
 * The handler type and name depend on the track type: "vide" for video,
 * "text" for chapter tracks and "soun" for everything else. The name is
 * written including its NUL terminator.
 */
static size_t mp4_write_hdlr(struct mp4_mux *mux, struct mp4_track *track)
{
	struct serializer *s = mux->serializer;
	const int64_t box_start = serializer_get_pos(s);

	/* Pick handler type, name and name length (incl. terminator) */
	const char *handler_type = "soun";
	const char *handler_name = "OBS Audio Handler";
	size_t name_len = 18;

	if (track->type == TRACK_VIDEO) {
		handler_type = "vide";
		handler_name = "OBS Video Handler";
		name_len = 18;
	} else if (track->type == TRACK_CHAPTERS) {
		handler_type = "text";
		handler_name = "OBS Chapter Handler";
		name_len = 20;
	}

	write_fullbox(s, 0, "hdlr", 0, 0);

	s_wb32(s, 0);               // pre_defined
	s_write(s, handler_type, 4); // handler_type

	// reserved (3 x u32)
	for (int i = 0; i < 3; i++)
		s_wb32(s, 0);

	// name (utf-8 string, null terminated)
	s_write(s, handler_name, name_len);

	return write_box_size(s, box_start);
}
/// 12.1.2 Video media header
/*
 * Writes the "vmhd" box and returns its size in bytes.
 *
 * The box is 20 bytes: a 12-byte FullBox header plus four u16 fields. The
 * return value matches the size written into the header.
 */
static size_t mp4_write_vmhd(struct mp4_mux *mux)
{
	enum { VMHD_BOX_SIZE = 20 };
	struct serializer *s = mux->serializer;

	/* Flags is always 1 */
	write_fullbox(s, VMHD_BOX_SIZE, "vmhd", 0, 1);

	s_wb16(s, 0); // graphicsmode
	s_wb16(s, 0); // opcolor r
	s_wb16(s, 0); // opcolor g
	s_wb16(s, 0); // opcolor b

	return VMHD_BOX_SIZE;
}
/// 12.2.2 Sound media header
/*
 * Writes the fixed-size (16 byte) "smhd" box with a balance of zero and
 * returns its size.
 */
static size_t mp4_write_smhd(struct mp4_mux *mux)
{
	enum { SMHD_BOX_SIZE = 16 };
	struct serializer *out = mux->serializer;

	write_fullbox(out, SMHD_BOX_SIZE, "smhd", 0, 0);
	s_wb16(out, 0); // balance
	s_wb16(out, 0); // reserved

	return SMHD_BOX_SIZE;
}
/// (QTFF/Apple) Text media information atom
/*
 * Writes the QuickTime "text" atom (a plain box, not a FullBox) and returns
 * its size in bytes. The payload is a fixed 36-byte blob.
 */
static size_t mp4_write_qt_text(struct mp4_mux *mux)
{
	/* Identity matrix, note that it's not fixed point 16.16.
	 * It is emitted as one leading u16, eight u32 words and a trailing
	 * u16 (the last one is seemingly undocumented). */
	static const uint32_t matrix_words[8] = {
		0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00004000,
	};

	struct serializer *s = mux->serializer;
	const int64_t box_start = serializer_get_pos(s);

	write_box(s, 0, "text");

	s_wb16(s, 0x01);
	for (size_t i = 0; i < sizeof(matrix_words) / sizeof(matrix_words[0]);
	     i++)
		s_wb32(s, matrix_words[i]);

	/* Seemingly undocumented */
	s_wb16(s, 0x0000);

	return write_box_size(s, box_start);
}
/// (QTFF/Apple) Base media info atom
/*
 * Writes the QuickTime "gmin" atom with fixed values and returns its size
 * in bytes.
 */
static size_t mp4_write_gmin(struct mp4_mux *mux)
{
	struct serializer *s = mux->serializer;
	const int64_t box_start = serializer_get_pos(s);

	write_fullbox(s, 0, "gmin", 0, 0);

	s_wb16(s, 0x40); // graphics mode

	// opColor r, g, b
	for (int channel = 0; channel < 3; channel++)
		s_wb16(s, 0x8000);

	s_wb16(s, 0); // balance
	s_wb16(s, 0); // reserved

	return write_box_size(s, box_start);
}
/// (QTFF/Apple) Base media information header atom
/*
 * Writes the QuickTime "gmhd" container atom, which holds the "gmin" atom
 * followed by the QuickTime "text" atom. Returns the total size in bytes.
 */
static size_t mp4_write_gmhd(struct mp4_mux *mux)
{
	struct serializer *s = mux->serializer;
	const int64_t box_start = serializer_get_pos(s);

	write_box(s, 0, "gmhd");

	/* Children, in order: gmin, then text (QuickTime) */
	mp4_write_gmin(mux);
	mp4_write_qt_text(mux);

	return write_box_size(s, box_start);
}
/// ISO/IEC 14496-15 5.4.2.1 AVCConfigurationBox
/*
 * Writes the "avcC" box for `enc` and returns its size in bytes, or 0
 * (writing nothing) if the encoder provides no extra data.
 *
 * The payload is the encoder's extra data after it has been run through
 * obs_parse_avc_header(); the parsed buffer is freed before returning.
 */
static size_t mp4_write_avcC(struct mp4_mux *mux, obs_encoder_t *enc)
{
	uint8_t *extra = NULL;
	size_t extra_size = 0;

	if (!obs_encoder_get_extra_data(enc, &extra, &extra_size))
		return 0;

	/* For AVC the box payload is the parsed extra data. */
	uint8_t *config = NULL;
	const size_t config_size =
		obs_parse_avc_header(&config, extra, extra_size);

	/* 8-byte box header + payload */
	const size_t total_size = config_size + 8;
	struct serializer *s = mux->serializer;

	write_box(s, total_size, "avcC");
	s_write(s, config, config_size);

	bfree(config);
	return total_size;
}
/// ISO/IEC 14496-15 8.4.1.1 HEVCConfigurationBox
/*
 * Writes the "hvcC" box for `enc` and returns its size in bytes, or 0
 * (writing nothing) if the encoder provides no extra data.
 *
 * The payload is the encoder's extra data after it has been run through
 * obs_parse_hevc_header(); the parsed buffer is freed before returning.
 */
static size_t mp4_write_hvcC(struct mp4_mux *mux, obs_encoder_t *enc)
{
	uint8_t *extra = NULL;
	size_t extra_size = 0;

	if (!obs_encoder_get_extra_data(enc, &extra, &extra_size))
		return 0;

	/* For HEVC the box payload is the parsed extra data. */
	uint8_t *config = NULL;
	const size_t config_size =
		obs_parse_hevc_header(&config, extra, extra_size);

	/* 8-byte box header + payload */
	const size_t total_size = config_size + 8;
	struct serializer *s = mux->serializer;

	write_box(s, total_size, "hvcC");
	s_write(s, config, config_size);

	bfree(config);
	return total_size;
}
/// AV1 ISOBMFF 2.3. AV1 Codec Configuration Box
/*
 * Writes the "av1C" box for `enc` and returns its size in bytes, or 0
 * (writing nothing) if the encoder provides no extra data.
 *
 * The payload is the encoder's extra data after it has been run through
 * obs_parse_av1_header(); the parsed buffer is freed before returning.
 */
static size_t mp4_write_av1C(struct mp4_mux *mux, obs_encoder_t *enc)
{
	uint8_t *extra = NULL;
	size_t extra_size = 0;

	if (!obs_encoder_get_extra_data(enc, &extra, &extra_size))
		return 0;

	/* For AV1 the box payload is just the parsed extra data. */
	uint8_t *config = NULL;
	const size_t config_size =
		obs_parse_av1_header(&config, extra, extra_size);

	/* 8-byte box header + payload */
	const size_t total_size = config_size + 8;
	struct serializer *s = mux->serializer;

	write_box(s, total_size, "av1C");
	s_write(s, config, config_size);

	bfree(config);
	return total_size;
}
/// 12.1.5 Colour information
/*
 * Writes the fixed-size (19 byte) "colr" box of colour type "nclx" for `enc`
 * and returns its size.
 *
 * Primaries, transfer characteristics, matrix coefficients and the range
 * flag are obtained from get_colour_information(); they default to 0 here
 * in case that helper leaves any of them untouched.
 */
static size_t mp4_write_colr(struct mp4_mux *mux, obs_encoder_t *enc)
{
	enum { COLR_BOX_SIZE = 19 };
	struct serializer *s = mux->serializer;

	uint8_t full_range = 0;
	uint16_t pri = 0;
	uint16_t trc = 0;
	uint16_t spc = 0;
	get_colour_information(enc, &pri, &trc, &spc, &full_range);

	write_box(s, COLR_BOX_SIZE, "colr");

	s_write(s, "nclx", 4);    // colour_type
	s_wb16(s, pri);           // colour_primaries
	s_wb16(s, trc);           // transfer_characteristics
	s_wb16(s, spc);           // matrix_coefficients
	s_w8(s, full_range << 7); // full range flag + 7 reserved bits (0)

	return COLR_BOX_SIZE;
}
/// 12.1.4 Pixel Aspect Ratio
/*
 * Writes the fixed-size (16 byte) "pasp" box with a 1:1 pixel aspect ratio
 * and returns its size.
 */
static size_t mp4_write_pasp(struct mp4_mux *mux)
{
	enum { PASP_BOX_SIZE = 16 };
	struct serializer *out = mux->serializer;

	write_box(out, PASP_BOX_SIZE, "pasp");
	s_wb32(out, 1); // hSpacing
	s_wb32(out, 1); // vSpacing

	return PASP_BOX_SIZE;
}
/// 12.1.3 Visual Sample Entry
/*
 * Writes the SampleEntry and VisualSampleEntry fields that follow the box
 * header of a video sample entry (the header itself is written by the
 * caller). Width and height come from the encoder; the compressor name is
 * the encoder id, truncated to 31 characters.
 */
static inline void mp4_write_visual_sample_entry(struct mp4_mux *mux,
						 obs_encoder_t *enc)
{
	struct serializer *s = mux->serializer;

	// SampleEntry Box
	for (int i = 0; i < 6; i++)
		s_w8(s, 0); // reserved
	s_wb16(s, 1);       // data_reference_index

	// VisualSampleEntry Box
	s_wb16(s, 0); // pre_defined
	s_wb16(s, 0); // reserved
	for (int i = 0; i < 3; i++)
		s_wb32(s, 0); // pre_defined

	s_wb16(s, (uint16_t)obs_encoder_get_width(enc));  // width
	s_wb16(s, (uint16_t)obs_encoder_get_height(enc)); // height
	s_wb32(s, 0x00480000); // horizresolution (predefined)
	s_wb32(s, 0x00480000); // vertresolution (predefined)
	s_wb32(s, 0);          // reserved
	s_wb16(s, 1);          // frame_count

	/* Name is fixed 32-bytes and needs to be padded to that length.
	 * First byte is the length, rest is a string sans NULL terminator. */
	char name_field[32] = {0};
	const char *encoder_id = obs_encoder_get_id(enc);

	if (encoder_id) {
		const size_t max_chars = sizeof(name_field) - 1;
		size_t id_len = strlen(encoder_id);

		if (id_len > max_chars)
			id_len = max_chars;

		name_field[0] = (char)id_len;
		memcpy(&name_field[1], encoder_id, id_len);
	}

	s_write(s, name_field, sizeof(name_field)); // compressorname
	s_wb16(s, 0x0018);                          // depth
	s_wb16(s, -1);                              // pre_defined
}
/// 12.1.6 Content light level
/*
 * Writes the fixed-size (12 byte) "clli" box for `enc` and returns its size.
 * Nothing is written, and 0 is returned, unless the encoder's video output
 * uses the 2100 PQ or 2100 HLG colorspace. Both light level fields are set
 * to the HDR nominal peak level.
 */
static size_t mp4_write_clli(struct mp4_mux *mux, obs_encoder_t *enc)
{
	enum { CLLI_BOX_SIZE = 12 };
	struct serializer *s = mux->serializer;

	video_t *video = obs_encoder_video(enc);
	const struct video_output_info *voi = video_output_get_info(video);

	/* Only write box for HDR video */
	const bool is_hdr = voi->colorspace == VIDEO_CS_2100_PQ ||
			    voi->colorspace == VIDEO_CS_2100_HLG;
	if (!is_hdr)
		return 0;

	write_box(s, CLLI_BOX_SIZE, "clli");

	const float peak = obs_get_video_hdr_nominal_peak_level();
	s_wb16(s, (uint16_t)peak); // max_content_light_level
	s_wb16(s, (uint16_t)peak); // max_pic_average_light_level

	return CLLI_BOX_SIZE;
}
/// 12.1.7 Mastering display colour volume
/*
 * Writes the fixed-size (32 byte) "mdcv" box for `enc` and returns its size.
 * Nothing is written, and 0 is returned, unless the encoder's video output
 * uses the 2100 PQ or 2100 HLG colorspace.
 */
static size_t mp4_write_mdcv(struct mp4_mux *mux, obs_encoder_t *enc)
{
	enum { MDCV_BOX_SIZE = 32 };

	/* Note that these values are hardcoded everywhere in OBS, so these are
	 * just the same as used in our other muxers/encoders.
	 * Layout: 3 x display_primaries (x, y) pairs, then white point x, y. */
	static const uint16_t chromaticities[8] = {
		13250, 34500, 7500, 3000, 34000, 16000, 15635, 16450,
	};

	struct serializer *s = mux->serializer;

	video_t *video = obs_encoder_video(enc);
	const struct video_output_info *voi = video_output_get_info(video);

	// Only write atom for HDR video
	const bool is_hdr = voi->colorspace == VIDEO_CS_2100_PQ ||
			    voi->colorspace == VIDEO_CS_2100_HLG;
	if (!is_hdr)
		return 0;

	write_box(s, MDCV_BOX_SIZE, "mdcv");

	const float peak = obs_get_video_hdr_nominal_peak_level();
	const uint32_t max_luminance = (uint32_t)peak * 10000;

	for (size_t i = 0;
	     i < sizeof(chromaticities) / sizeof(chromaticities[0]); i++)
		s_wb16(s, chromaticities[i]);

	s_wb32(s, max_luminance); // max_display_mastering_luminance
	s_wb32(s, 0);             // min_display_mastering_luminance

	return MDCV_BOX_SIZE;
}
/// ISO/IEC 14496-15 5.4.2.1 AVCSampleEntry
/*
 * Writes the "avc1" sample entry for `enc` and returns its size in bytes.
 * Contents, in order: visual sample entry fields, avcC, colr, pasp.
 */
static size_t mp4_write_avc1(struct mp4_mux *mux, obs_encoder_t *enc)
{
	struct serializer *s = mux->serializer;
	const int64_t entry_start = serializer_get_pos(s);

	write_box(s, 0, "avc1");
	mp4_write_visual_sample_entry(mux, enc);

	/* Child boxes */
	mp4_write_avcC(mux, enc); // codec configuration
	mp4_write_colr(mux, enc); // colour information
	mp4_write_pasp(mux);      // pixel aspect ratio

	return write_box_size(s, entry_start);
}
/// ISO/IEC 14496-15 8.4.1.1 HEVCSampleEntry
/*
 * Writes the "hvc1" sample entry for `enc` and returns its size in bytes.
 * Contents, in order: visual sample entry fields, hvcC, colr, clli, mdcv,
 * pasp (clli and mdcv write nothing for non-HDR video).
 */
static size_t mp4_write_hvc1(struct mp4_mux *mux, obs_encoder_t *enc)
{
	struct serializer *s = mux->serializer;
	const int64_t entry_start = serializer_get_pos(s);

	write_box(s, 0, "hvc1");
	mp4_write_visual_sample_entry(mux, enc);

	/* Child boxes */
	mp4_write_hvcC(mux, enc); // codec configuration
	mp4_write_colr(mux, enc); // colour information
	mp4_write_clli(mux, enc); // content light level
	mp4_write_mdcv(mux, enc); // mastering display colour volume
	mp4_write_pasp(mux);      // pixel aspect ratio

	return write_box_size(s, entry_start);
}
/// AV1 ISOBMFF 2.2. AV1 Sample Entry
/*
 * Writes the "av01" sample entry for `enc` and returns its size in bytes.
 * Contents, in order: visual sample entry fields, av1C, colr, clli, mdcv,
 * pasp (clli and mdcv write nothing for non-HDR video).
 */
static size_t mp4_write_av01(struct mp4_mux *mux, obs_encoder_t *enc)
{
	struct serializer *s = mux->serializer;
	const int64_t entry_start = serializer_get_pos(s);

	write_box(s, 0, "av01");
	mp4_write_visual_sample_entry(mux, enc);

	/* Child boxes */
	mp4_write_av1C(mux, enc); // codec configuration
	mp4_write_colr(mux, enc); // colour information
	mp4_write_clli(mux, enc); // content light level
	mp4_write_mdcv(mux, enc); // mastering display colour volume
	mp4_write_pasp(mux);      // pixel aspect ratio

	return write_box_size(s, entry_start);
}
/*
 * Writes a descriptor header: the tag byte followed by `size` encoded in
 * exactly four bytes of 7 bits each, most significant group first. The first
 * three bytes have their top bit set, the last one does not.
 */
static inline void put_descr(struct serializer *s, uint8_t tag, size_t size)
{
	s_w8(s, tag);

	/* Bits 27..7 in three groups, each flagged with the top bit */
	for (int shift = 21; shift >= 7; shift -= 7)
		s_w8(s, (uint8_t)((size >> shift) | 0x80));

	/* Final group: lowest 7 bits, top bit clear */
	s_w8(s, (uint8_t)(size & 0x7F));
}
/// ISO/IEC 14496-14 5.6 ESDBox
/*
 * Writes the "esds" box for `track` and returns its size in bytes.
 *
 * The box carries an ES_Descriptor containing a DecoderConfigDescriptor
 * (with the encoder extra data as DecoderSpecificInfo, when available) and
 * an SLConfigDescriptor. Max and average bitrate are both taken from the
 * encoder's "bitrate" setting multiplied by 1000, or 0 if it is unset.
 */
static size_t mp4_write_esds(struct mp4_mux *mux, struct mp4_track *track)
{
	struct serializer *s = mux->serializer;
	const int64_t box_start = serializer_get_pos(s);

	write_fullbox(s, 0, "esds", 0, 0);

	/* Encoder extradata will be used as DecoderSpecificInfo */
	uint8_t *dsi;
	size_t dsi_size;
	if (!obs_encoder_get_extra_data(track->encoder, &dsi, &dsi_size))
		dsi_size = 0;

	/* A present DecoderSpecificInfo costs its payload plus a 5-byte
	 * descriptor header (tag + 4 size bytes). */
	const size_t dsi_descr_len = dsi_size ? dsi_size + 5 : 0;

	/// ISO/IEC 14496-1
	// ES_Descriptor
	put_descr(s, 0x03, 3 + 5 + 13 + dsi_descr_len + 5 + 1);
	s_wb16(s, track->track_id);
	s_w8(s, 0x00); // flags

	// DecoderConfigDescriptor
	put_descr(s, 0x04, 13 + dsi_descr_len);
	s_w8(s, 0x40); // codec tag, 0x40 = AAC
	s_w8(s, 0x15); // stream type field (0x15 = audio stream)

	/* When writing the final MOOV this could theoretically be calculated
	 * based on chunks, but it's not really all that important. */
	uint32_t bitrate = 0;
	obs_data_t *enc_settings = obs_encoder_get_settings(track->encoder);
	if (enc_settings) {
		const int64_t kbps = obs_data_get_int(enc_settings, "bitrate");
		if (kbps)
			bitrate = (uint32_t)(kbps * 1000);
		obs_data_release(enc_settings);
	}

	s_wb24(s, 0);       // bufferSizeDB (in bytes)
	s_wb32(s, bitrate); // maxbitrate
	s_wb32(s, bitrate); // avgBitrate

	// DecoderSpecificInfo
	if (dsi_size) {
		put_descr(s, 0x05, dsi_size);
		s_write(s, dsi, dsi_size);
	}

	// SLConfigDescriptor descriptor
	put_descr(s, 0x06, 1);
	s_w8(s, 0x02); // 0x2 = reserved for MP4, descriptor is empty

	return write_box_size(s, box_start);
}
/// 12.2.3 Audio Sample Entry
/*
 * Writes the SampleEntry and AudioSampleEntry fields that follow the box
 * header of an audio sample entry (the header itself is written by the
 * caller). `version` selects between the v0 layout (8 reserved bytes) and
 * the v1 layout (entry_version = 1 followed by 6 reserved bytes).
 */
static inline void mp4_write_audio_sample_entry(struct mp4_mux *mux,
						struct mp4_track *track,
						uint8_t version)
{
	struct serializer *s = mux->serializer;

	// SampleEntry Box
	for (int i = 0; i < 6; i++)
		s_w8(s, 0); // reserved
	s_wb16(s, 1);       // data_reference_index

	// AudioSampleEntry Box
	if (version != 1) {
		s_wb32(s, 0); // reserved
		s_wb32(s, 0); // reserved
	} else {
		s_wb16(s, 1); // entry_version
		for (int i = 0; i < 3; i++)
			s_wb16(s, 0); // reserved
	}

	audio_t *audio = obs_encoder_audio(track->encoder);
	const size_t channel_count = audio_output_get_channels(audio);
	const uint32_t rate = track->timescale;

	/* OBS FLAC is currently always 16 bit, ALAC always 24, this may change
	 * in the future and should be handled differently then.
	 * That being said those codecs are self-describing so in most cases it
	 * shouldn't matter either way. */
	const uint16_t bits_per_sample = track->codec == CODEC_ALAC ? 24 : 16;

	s_wb16(s, (uint32_t)channel_count); // channelcount
	s_wb16(s, bits_per_sample);         // samplesize
	s_wb16(s, 0);                       // pre_defined
	s_wb16(s, 0);                       // reserved
	s_wb32(s, rate << 16);              // samplerate (16.16 fixed point)
}
/// 12.2.4 Channel layout
/*
 * Writes the "chnl" box for `track` and returns its size in bytes.
 *
 * get_speaker_positions() supplies either a predefined layout id or, when
 * none exists, an explicit list of speaker positions. In the latter case a
 * warning is logged and the explicit list is written instead.
 */
static size_t mp4_write_chnl(struct mp4_mux *mux, struct mp4_track *track)
{
	struct serializer *s = mux->serializer;
	const int64_t box_start = serializer_get_pos(s);

	write_fullbox(s, 0, "chnl", 0, 0);

	audio_t *audio = obs_encoder_audio(track->encoder);
	const struct audio_output_info *aoi = audio_output_get_info(audio);

	s_w8(s, 1); // stream_structure (1 = channels)

	/* 5.1 and 4.1 do not have a corresponding ISO layout, so we have to
	 * write a manually created channel map for those. */
	uint8_t positions[8] = {0};
	uint8_t position_count = 0;
	uint8_t layout_id = 0;
	get_speaker_positions(aoi->speakers, positions, &position_count,
			      &layout_id);

	if (layout_id) {
		s_w8(s, layout_id); // definedLayout
		s_wb64(s, 0);       // omittedChannelsMap
	} else {
		warn("No ISO layout available for speaker layout %d, "
		     "this may not be supported by all applications!",
		     aoi->speakers);
		s_w8(s, 0); // definedLayout
		// uint8_t speaker_position[count]
		s_write(s, positions, position_count);
	}

	return write_box_size(s, box_start);
}
/// ISO/IEC 14496-14 5.6 MP4AudioSampleEntry
/*
 * Writes the "mp4a" sample entry for `track` and returns its size in bytes.
 * Contents: audio sample entry fields, esds, and - for version 1 sample
 * entries only - the channel layout box.
 */
static size_t mp4_write_mp4a(struct mp4_mux *mux, struct mp4_track *track,
			     uint8_t version)
{
	struct serializer *s = mux->serializer;
	const int64_t entry_start = serializer_get_pos(s);
	const bool with_channel_layout = version == 1;

	write_box(s, 0, "mp4a");
	mp4_write_audio_sample_entry(mux, track, version);

	mp4_write_esds(mux, track);

	/* Write channel layout for version 1 sample entries */
	if (with_channel_layout)
		mp4_write_chnl(mux, track);

	return write_box_size(s, entry_start);
}
/// Encapsulation of FLAC in ISO Base Media File Format 3.3.2 FLAC Specific Box
/*
 * Writes the "dfLa" box for `track` and returns its size in bytes, or 0
 * (writing nothing) if the encoder provides no extra data. The extra data is
 * wrapped in a single metadata block header marked as the last block.
 */
static size_t mp4_write_dfLa(struct mp4_mux *mux, struct mp4_track *track)
{
	struct serializer *s = mux->serializer;
	const int64_t box_start = serializer_get_pos(s);

	uint8_t *block_data;
	size_t block_len;
	if (!obs_encoder_get_extra_data(track->encoder, &block_data,
					&block_len))
		return 0;

	write_fullbox(s, 0, "dfLa", 0, 0);

	/// FLACMetadataBlock
	s_w8(s, 1 << 7 | 0);            // LastMetadataBlockFlag (1) | BlockType (0)
	s_wb24(s, (uint32_t)block_len); // Length
	s_write(s, block_data, block_len); // BlockData[Length]

	return write_box_size(s, box_start);
}
/// Encapsulation of FLAC in ISO Base Media File Format 3.3.1 FLACSampleEntry
/*
 * Writes the "fLaC" sample entry for `track` and returns its size in bytes.
 * Contents: audio sample entry fields, dfLa, and - for version 1 sample
 * entries only - the channel layout box.
 */
static size_t mp4_write_fLaC(struct mp4_mux *mux, struct mp4_track *track,
			     uint8_t version)
{
	struct serializer *s = mux->serializer;
	const int64_t entry_start = serializer_get_pos(s);
	const bool with_channel_layout = version == 1;

	write_box(s, 0, "fLaC");
	mp4_write_audio_sample_entry(mux, track, version);

	mp4_write_dfLa(mux, track);

	if (with_channel_layout)
		mp4_write_chnl(mux, track);

	return write_box_size(s, entry_start);
}
/// Apple Lossless Format "Magic Cookie" Description - MP4/M4A File
/*
 * Writes the "alac" sample entry for `track` and returns its size in bytes,
 * or 0 (writing nothing) if the encoder provides no extra data.
 * Contents: audio sample entry fields, the encoder extra data verbatim, and
 * - for version 1 sample entries only - the channel layout box.
 */
static size_t mp4_write_alac(struct mp4_mux *mux, struct mp4_track *track,
			     uint8_t version)
{
	struct serializer *s = mux->serializer;
	const int64_t entry_start = serializer_get_pos(s);

	uint8_t *cookie;
	size_t cookie_size;
	if (!obs_encoder_get_extra_data(track->encoder, &cookie, &cookie_size))
		return 0;

	write_box(s, 0, "alac");
	mp4_write_audio_sample_entry(mux, track, version);

	/* Apple Lossless Magic Cookie */
	s_write(s, cookie, cookie_size);

	if (version == 1)
		mp4_write_chnl(mux, track);

	return write_box_size(s, entry_start);
}
/// ISO/IEC 23003-5 5.1 PCM configuration
/*
 * Writes the "pcmC" box for `track` and returns its size in bytes.
 * The sample size byte is only written for the three known PCM codecs
 * (16-bit int, 24-bit int, 32-bit float).
 */
static size_t mp4_write_pcmc(struct mp4_mux *mux, struct mp4_track *track)
{
	struct serializer *s = mux->serializer;
	const int64_t box_start = serializer_get_pos(s);

	write_fullbox(s, 0, "pcmC", 0, 0);

	s_w8(s, 1); // endianness, 1 = little endian

	/* Map codec to bits per sample; 0 means "unknown, write nothing" */
	uint8_t bits_per_sample = 0;
	if (track->codec == CODEC_PCM_I16)
		bits_per_sample = 16;
	else if (track->codec == CODEC_PCM_I24)
		bits_per_sample = 24;
	else if (track->codec == CODEC_PCM_F32)
		bits_per_sample = 32;

	if (bits_per_sample)
		s_w8(s, bits_per_sample); // bits per sample

	return write_box_size(s, box_start);
}
/// ISO/IEC 23003-5 5.1 PCM configuration
/*
 * Writes the PCM sample entry for `track` and returns its size in bytes.
 * The box type is "fpcm" for 32-bit float PCM and "ipcm" for everything
 * else. Contents: audio sample entry fields, chnl, pcmC.
 */
static size_t mp4_write_xpcm(struct mp4_mux *mux, struct mp4_track *track,
			     uint8_t version)
{
	struct serializer *s = mux->serializer;
	const int64_t entry_start = serializer_get_pos(s);

	/* Different box types for floating point and integer PCM */
	const char *box_type = "ipcm";
	if (track->codec == CODEC_PCM_F32)
		box_type = "fpcm";

	write_box(s, 0, box_type);
	mp4_write_audio_sample_entry(mux, track, version);

	/* ChannelLayout (chnl) is required for PCM */
	mp4_write_chnl(mux, track);
	mp4_write_pcmc(mux, track);

	return write_box_size(s, entry_start);
}
/// (QTFF/Apple) Text sample description
/*
 * Writes the "text" sample description and returns its size in bytes.
 * The payload is an entry count of 1 followed by the TEXT_STUB_HEADER blob.
 */
static size_t mp4_write_text(struct mp4_mux *mux)
{
	struct serializer *out = mux->serializer;
	const int64_t box_start = serializer_get_pos(out);

	write_fullbox(out, 0, "text", 0, 0);
	s_wb32(out, 1); // number of entries

	/* Preset sample description as used by FFmpeg. */
	s_write(out, &TEXT_STUB_HEADER, sizeof(TEXT_STUB_HEADER));

	return write_box_size(out, box_start);
}
/*
 * Reads a little-endian 32-bit unsigned integer from the four bytes at `ptr`.
 *
 * Each byte is widened to uint32_t before shifting: shifting the promoted
 * (signed) int left by 24 is undefined behaviour when the byte is >= 0x80.
 */
static inline uint32_t rl32(const uint8_t *ptr)
{
	return ((uint32_t)ptr[3] << 24) | ((uint32_t)ptr[2] << 16) |
	       ((uint32_t)ptr[1] << 8) | (uint32_t)ptr[0];
}
/* Reads a little-endian 16-bit unsigned integer from the two bytes at `ptr`. */
static inline uint16_t rl16(const uint8_t *ptr)
{
	const unsigned low_byte = ptr[0];
	const unsigned high_byte = ptr[1];

	return (uint16_t)((high_byte << 8) | low_byte);
}
/// Encapsulation of Opus in ISO Base Media File Format 4.3.2 Opus Specific Box
/*
 * Writes the "dOps" box for `track` and returns its size in bytes.
 *
 * The fields are copied out of the encoder extra data at fixed offsets
 * (channel count at 9, pre-skip at 10, sample rate at 12, gain at 16,
 * mapping family at 18, mapping table from 19). Multi-byte values are
 * little-endian in the extra data and are written big-endian here.
 *
 * Returns 0 and writes nothing if the encoder has no extra data, or if the
 * extra data is too short to contain the fields that would be read.
 */
static size_t mp4_write_dOps(struct mp4_mux *mux, struct mp4_track *track)
{
	/* Number of bytes up to and including the channel mapping family */
	enum { OPUS_HEAD_FIXED_SIZE = 19 };

	struct serializer *s = mux->serializer;
	int64_t start = serializer_get_pos(s);

	uint8_t *extradata;
	size_t extradata_size;
	if (!obs_encoder_get_extra_data(track->encoder, &extradata,
					&extradata_size))
		return 0;

	/* Validate the length before touching any fixed offset */
	if (extradata_size < OPUS_HEAD_FIXED_SIZE) {
		warn("Opus extra data for track %u is too short, "
		     "not writing dOps box!",
		     track->track_id);
		return 0;
	}

	uint8_t channels = extradata[9];
	uint8_t channel_map = extradata[18];

	/* A non-zero mapping family is followed by stream count, coupled
	 * count and one mapping byte per channel. */
	size_t mapping_size = channel_map ? (size_t)2 + channels : 0;
	if (extradata_size < OPUS_HEAD_FIXED_SIZE + mapping_size) {
		warn("Opus channel mapping for track %u is truncated, "
		     "not writing dOps box!",
		     track->track_id);
		return 0;
	}

	write_box(s, 0, "dOps");
	s_w8(s, 0);        // version
	s_w8(s, channels); // channel count

	// OpusHead is little-endian, but MP4 is big-endian, so we have to swap them here
	s_wb16(s, rl16(extradata + 10)); // pre-skip
	s_wb32(s, rl32(extradata + 12)); // input sample rate
	s_wb16(s, rl16(extradata + 16)); // output gain
	s_w8(s, channel_map);            // channel mapping family

	if (channel_map)
		s_write(s, extradata + OPUS_HEAD_FIXED_SIZE, mapping_size);

	return write_box_size(s, start);
}
/// Encapsulation of Opus in ISO Base Media File Format 4.3.1 Sample entry format
/*
 * Writes the "Opus" sample entry for `track` and returns its size in bytes.
 * Contents: audio sample entry fields, dOps, and - for version 1 sample
 * entries only - the channel layout box.
 */
static size_t mp4_write_Opus(struct mp4_mux *mux, struct mp4_track *track,
			     uint8_t version)
{
	struct serializer *s = mux->serializer;
	const int64_t entry_start = serializer_get_pos(s);
	const bool with_channel_layout = version == 1;

	write_box(s, 0, "Opus");
	mp4_write_audio_sample_entry(mux, track, version);

	mp4_write_dOps(mux, track);

	if (with_channel_layout)
		mp4_write_chnl(mux, track);

	return write_box_size(s, entry_start);
}
/// 8.5.2 Sample Description Box
static size_t mp4_write_stsd(struct mp4_mux *mux, struct mp4_track *track)
{
struct serializer *s = mux->serializer;
int64_t start = serializer_get_pos(s);
/* Anything but mono or stereo technically requires v1,
* but in practice that doesn't appear to matter. */
uint8_t version = 0;
if (track->type == TRACK_AUDIO) {
audio_t *audio = obs_encoder_audio(track->encoder);
version = audio_output_get_channels(audio) > 2 ? 1 : 0;
}
write_fullbox(s, 0, "stsd", version, 0);
s_wb32(s, 1); // entry_count
// codec specific boxes
if (track->type == TRACK_VIDEO) {
if (track->codec == CODEC_H264)
mp4_write_avc1(mux, track->encoder);
else if (track->codec == CODEC_HEVC)
mp4_write_hvc1(mux, track->encoder);
else if (track->codec == CODEC_AV1)
mp4_write_av01(mux, track->encoder);
} else if (track->type == TRACK_AUDIO) {
if (track->codec == CODEC_AAC)
mp4_write_mp4a(mux, track, version);
else if (track->codec == CODEC_OPUS)
mp4_write_Opus(mux, track, version);
else if (track->codec == CODEC_FLAC)
mp4_write_fLaC(mux, track, version);
else if (track->codec == CODEC_ALAC)
mp4_write_alac(mux, track, version);
else if (track->codec == CODEC_PCM_I16 ||
track->codec == CODEC_PCM_I24 ||
track->codec == CODEC_PCM_F32)
mp4_write_xpcm(mux, track, version);
} else if (track->type == TRACK_CHAPTERS) {
mp4_write_text(mux);
}
return write_box_size(s, start);
}
/// 8.6.1.2 Decoding Time to Sample Box
static size_t mp4_write_stts(struct mp4_mux *mux, struct mp4_track *track,
bool fragmented)
{
struct serializer *s = mux->serializer;
if (fragmented) {
write_fullbox(s, 16, "stts", 0, 0);
s_wb32(s, 0); // entry_count
return 16;
}
int64_t start = serializer_get_pos(s);
struct sample_delta *arr = track->deltas.array;
size_t num = track->deltas.num;
write_fullbox(s, 0, "stts", 0, 0);
s_wb32(s, (uint32_t)num); // entry_count
for (size_t idx = 0; idx < num; idx++) {
struct sample_delta *smp = &arr[idx];
uint64_t delta = util_mul_div64(smp->delta, track->timescale,
track->timebase_den);
s_wb32(s, smp->count); // sample_count
s_wb32(s, (uint32_t)delta); // sample_delta
}
return write_box_size(s, start);
}
/// 8.6.2 Sync Sample Box
/*
 * Writes the "stss" box listing the track's sync sample numbers and returns
 * its size in bytes. If the track has no sync samples recorded, nothing is
 * written and 0 is returned.
 */
static size_t mp4_write_stss(struct mp4_mux *mux, struct mp4_track *track)
{
	const uint32_t entry_count = (uint32_t)track->sync_samples.num;
	if (entry_count == 0)
		return 0;

	struct serializer *s = mux->serializer;

	/* 16 byte FullBox header + 4-bytes (u32) per sync sample */
	const uint32_t box_size = 16 + 4 * entry_count;

	write_fullbox(s, box_size, "stss", 0, 0);
	s_wb32(s, entry_count); // entry_count

	for (size_t i = 0; i < entry_count; i++)
		s_wb32(s, track->sync_samples.array[i]); // sample_number

	return box_size;
}
/// 8.6.1.3 Composition Time to Sample Box
/*
 * Writes the "ctts" box for `track` and returns its size in bytes.
 *
 * The box version is 1 when MP4_USE_NEGATIVE_CTS is set and 0 otherwise.
 * Each offset is converted from the track timebase to the track timescale
 * using signed 64-bit arithmetic and then written as 32 bits.
 */
static size_t mp4_write_ctts(struct mp4_mux *mux, struct mp4_track *track)
{
	struct serializer *s = mux->serializer;

	const uint32_t entry_count = (uint32_t)track->offsets.num;
	const uint8_t version = mux->flags & MP4_USE_NEGATIVE_CTS ? 1 : 0;

	/* 16 byte FullBox header + 8-bytes (u32+u32/i32) per offset entry */
	const uint32_t box_size = 16 + 8 * entry_count;

	write_fullbox(s, box_size, "ctts", version, 0);
	s_wb32(s, entry_count); // entry_count

	const int64_t scale_num = (int64_t)track->timescale;
	const int64_t scale_den = (int64_t)track->timebase_den;

	for (size_t i = 0; i < entry_count; i++) {
		const int64_t scaled =
			(int64_t)track->offsets.array[i].offset * scale_num /
			scale_den;

		s_wb32(s, track->offsets.array[i].count); // sample_count
		s_wb32(s, (uint32_t)scaled);              // sample_offset
	}

	return box_size;
}
/// 8.7.4 Sample To Chunk Box
static size_t mp4_write_stsc(struct mp4_mux *mux, struct mp4_track *track,
bool fragmented)
{
struct serializer *s = mux->serializer;
if (fragmented) {
write_fullbox(s, 16, "stsc", 0, 0);
s_wb32(s, 0); // entry_count
return 16;
}
struct chunk *arr = track->chunks.array;
size_t arr_num = track->chunks.num;
/* Compress into array with counter for repeating chunk sizes */
DARRAY(struct chunk_run {
uint32_t first;
uint32_t samples;
}) chunk_runs;
da_init(chunk_runs);
for (size_t idx = 0; idx < arr_num; idx++) {
struct chunk *chk = &arr[idx];
if (!chunk_runs.num ||
chunk_runs.array[chunk_runs.num - 1].samples !=
chk->samples) {
struct chunk_run *cr = da_push_back_new(chunk_runs);
cr->samples = chk->samples;
cr->first = (uint32_t)idx + 1; // ISO-BMFF is 1-indexed
}
}
uint32_t num = (uint32_t)chunk_runs.num;
/* 16 byte FullBox header + 12-bytes (u32+u32+u32) per chunk run */
uint32_t size = 16 + 12 * num;
write_fullbox(s, size, "stsc", 0, 0);
s_wb32(s, num); // entry_count
for (size_t idx = 0; idx < num; idx++) {
struct chunk_run *cr = &chunk_runs.array[idx];
s_wb32(s, cr->first); // first_chunk
s_wb32(s, cr->samples); // samples_per_chunk
s_wb32(s, 1); // sample_description_index
}
da_free(chunk_runs);
return size;
}
/// 8.7.3 Sample Size Boxes
/*
 * Writes the "stsz" box for `track` and returns its size in bytes.
 *
 * When `fragmented` is true an empty (20 byte) table is written. Otherwise,
 * tracks with a fixed sample size write only that size and the sample count,
 * while other tracks write one size entry per sample. A warning is logged if
 * the sample count does not fit in 32 bits.
 */
static size_t mp4_write_stsz(struct mp4_mux *mux, struct mp4_track *track,
			     bool fragmented)
{
	struct serializer *s = mux->serializer;

	if (fragmented) {
		write_fullbox(s, 20, "stsz", 0, 0);
		s_wb32(s, 0); // sample_size
		s_wb32(s, 0); // sample_count
		return 20;
	}

	const int64_t box_start = serializer_get_pos(s);

	/* This should only ever happen when recording > 24 hours of
	 * 48 kHz PCM audio or 828 days of 60 FPS video. */
	if (track->samples > UINT32_MAX) {
		warn("Track %u has too many samples, its duration may not be "
		     "read correctly. Remuxing the file to another format such "
		     "as MKV may be required.",
		     track->track_id);
	}

	write_fullbox(s, 0, "stsz", 0, 0);

	if (!track->sample_size) {
		/* Variable sample sizes: explicit table */
		const size_t entry_count = track->sample_sizes.num;

		s_wb32(s, 0);                     // sample_size
		s_wb32(s, (uint32_t)entry_count); // sample_count
		for (size_t i = 0; i < entry_count; i++)
			s_wb32(s, track->sample_sizes.array[i]); // entry_size
	} else {
		/* Fixed size samples mean we don't need an array */
		s_wb32(s, track->sample_size);       // sample_size
		s_wb32(s, (uint32_t)track->samples); // sample_count
	}

	return write_box_size(s, box_start);
}
/// 8.7.5 Chunk Offset Box
/* Writes the chunk offset table for `track` and returns the box size.
 *
 * Fragmented output gets an empty "stco". Otherwise the offset of the last
 * chunk selects between 32-bit entries ("stco") and 64-bit entries ("co64").
 * The non-fragmented path reads the last chunk, so it expects the track to
 * have at least one chunk. */
static size_t mp4_write_stco(struct mp4_mux *mux, struct mp4_track *track,
			     bool fragmented)
{
	struct serializer *out = mux->serializer;

	if (fragmented) {
		write_fullbox(out, 16, "stco", 0, 0);
		s_wb32(out, 0); // entry_count
		return 16;
	}

	struct chunk *chunks = track->chunks.array;
	uint32_t count = (uint32_t)track->chunks.num;
	uint64_t last_off = chunks[count - 1].offset;
	bool wide = last_off > UINT32_MAX;

	/* 16 byte FullBox header + entry_count, followed by 8 bytes (u64)
	 * per chunk for co64 or 4 bytes (u32) per chunk for stco. */
	uint32_t entry_bytes = wide ? 8 : 4;
	uint32_t size = 16 + entry_bytes * count;

	write_fullbox(out, size, wide ? "co64" : "stco", 0, 0);
	s_wb32(out, count); // entry_count

	for (size_t i = 0; i < count; i++) {
		if (wide)
			s_wb64(out, chunks[i].offset); // chunk_offset
		else
			s_wb32(out, (uint32_t)chunks[i].offset); // chunk_offset
	}

	return size;
}
/// 8.9.3 Sample Group Description Box
/* Writes a version 1 "sgpd" box describing a single "roll" group with a
 * roll distance of -1 and returns the box size. */
static size_t mp4_write_sgpd_aac(struct mp4_mux *mux)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_fullbox(out, 0, "sgpd", 1, 0);

	s_write(out, "roll", 4); // grouping_type
	s_wb32(out, 2);          // default_length (i16)
	s_wb32(out, 1);          // entry_count

	/* AudioRollRecoveryEntry */
	s_wb16(out, -1); // roll_distance

	return write_box_size(out, box_start);
}
/// 8.9.2 Sample to Group Box
/* Writes an "sbgp" box that assigns every sample of `track` to "roll" group
 * description index 1 and returns the box size. */
static size_t mp4_write_sbgp_aac(struct mp4_mux *mux, struct mp4_track *track)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_fullbox(out, 0, "sbgp", 0, 0);

	/// 10.1 AudioRollRecoveryEntry
	s_write(out, "roll", 4); // grouping_type
	s_wb32(out, 1);          // entry_count

	/* Single entry covering the whole track */
	s_wb32(out, (uint32_t)track->samples); // sample_count
	s_wb32(out, 1);                        // group_description_index

	return write_box_size(out, box_start);
}
/* Writes the "sgpd" box followed by the "sbgp" box for an Opus track and
 * returns the combined size of both boxes.
 *
 * The roll distance is the negated number of leading samples whose deltas
 * add up to the Opus pre-roll. The sbgp box then maps those leading samples
 * to group index 0 and the remaining samples to group index 1. */
static size_t mp4_write_sbgp_sbgp_opus(struct mp4_mux *mux,
				       struct mp4_track *track)
{
	struct serializer *out = mux->serializer;

	/* Opus requires 80 ms of preroll, which at 48 kHz is 3840 PCM samples */
	const int64_t opus_preroll = 3840;

	/* Count how many leading samples are needed to cover the preroll
	 * (should be 4, each being 20 ms). */
	uint16_t preroll_count = 0;
	int64_t remaining = opus_preroll;

	for (size_t run = 0; run < track->deltas.num; run++) {
		if (remaining <= 0)
			break;

		for (uint32_t n = 0;
		     n < track->deltas.array[run].count && remaining > 0;
		     n++) {
			remaining -= track->deltas.array[run].delta;
			preroll_count++;
		}
	}

	/// 8.9.3 Sample Group Description Box
	int64_t box_start = serializer_get_pos(out);
	write_fullbox(out, 0, "sgpd", 1, 0);
	s_write(out, "roll", 4); // grouping_type
	s_wb32(out, 2);          // default_length (i16)
	s_wb32(out, 1);          // entry_count
	/// 10.1 AudioRollRecoveryEntry
	s_wb16(out, -preroll_count); // roll_distance
	size_t sgpd_size = write_box_size(out, box_start);

	/* --------------- */

	/// 8.9.2 Sample to Group Box
	box_start = serializer_get_pos(out);
	write_fullbox(out, 0, "sbgp", 0, 0);
	s_write(out, "roll", 4); // grouping_type
	s_wb32(out, 2);          // entry_count

	/* entry 0: the preroll samples themselves */
	s_wb32(out, preroll_count); // sample_count
	s_wb32(out, 0);             // group_description_index

	/* entry 1: everything after the preroll */
	s_wb32(out, (uint32_t)track->samples - preroll_count); // sample_count
	s_wb32(out, 1); // group_description_index

	size_t sbgp_size = write_box_size(out, box_start);

	return sgpd_size + sbgp_size;
}
/// 8.5.1 Sample Table Box
/* Writes the "stbl" container and its children for `track`, returning the
 * total box size. The stss and ctts boxes and the audio roll groups are only
 * written for non-fragmented output. */
static size_t mp4_write_stbl(struct mp4_mux *mux, struct mp4_track *track,
			     bool fragmented)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);
	const bool full = !fragmented;

	write_box(out, 0, "stbl");

	mp4_write_stsd(mux, track);
	mp4_write_stts(mux, track, fragmented);

	/* Sync sample table, video only */
	if (full && track->type == TRACK_VIDEO)
		mp4_write_stss(mux, track);

	/* Composition offsets, only when the track needs them */
	if (full && track->needs_ctts)
		mp4_write_ctts(mux, track);

	mp4_write_stsc(mux, track, fragmented);
	mp4_write_stsz(mux, track, fragmented);
	mp4_write_stco(mux, track, fragmented);

	if (full) {
		/* AAC and Opus require a pre-roll to get correct decoder
		 * output, sgpd and sbgp are used to create a "roll" group. */
		switch (track->codec) {
		case CODEC_AAC:
			mp4_write_sgpd_aac(mux);
			mp4_write_sbgp_aac(mux, track);
			break;
		case CODEC_OPUS:
			/* writes both sgpd and sbgp */
			mp4_write_sbgp_sbgp_opus(mux, track);
			break;
		default:
			break;
		}
	}

	return write_box_size(out, box_start);
}
/// 8.7.2.2 DataEntryUrlBox
/* Writes a "url " box with no payload and returns its size. */
static size_t mp4_write_url(struct mp4_mux *mux)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	/* Flag 1 means the data is in this file, so the box stays empty. */
	write_fullbox(out, 0, "url ", 0, 1);

	return write_box_size(out, box_start);
}
/// 8.7.2 Data Reference Box
/* Writes the "dref" box containing exactly one data entry (an empty "url "
 * box) and returns the total box size. */
static size_t mp4_write_dref(struct mp4_mux *mux)
{
	struct serializer *s = mux->serializer;
	int64_t start = serializer_get_pos(s);

	/* Box types are four-character codes: the literal must be exactly
	 * "dref", without the stray trailing space it previously carried. */
	write_fullbox(s, 0, "dref", 0, 0);
	s_wb32(s, 1); // entry_count

	mp4_write_url(mux);

	return write_box_size(s, start);
}
/// 8.7.1 Data Information Box
/* Writes the "dinf" container, whose only child is the "dref" box, and
 * returns the total box size. */
static size_t mp4_write_dinf(struct mp4_mux *mux)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_box(out, 0, "dinf");
	mp4_write_dref(mux);

	return write_box_size(out, box_start);
}
/// 8.4.4 Media Information Box
/* Writes the "minf" container for `track`: a type-specific media header,
 * then dinf, then stbl. Returns the total box size. */
static size_t mp4_write_minf(struct mp4_mux *mux, struct mp4_track *track,
			     bool fragmented)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_box(out, 0, "minf");

	/* Media header: vmhd for video, gmhd for chapters, smhd otherwise */
	switch (track->type) {
	case TRACK_VIDEO:
		mp4_write_vmhd(mux);
		break;
	case TRACK_CHAPTERS:
		mp4_write_gmhd(mux);
		break;
	default:
		mp4_write_smhd(mux);
		break;
	}

	/* dinf is unnecessary but mandatory */
	mp4_write_dinf(mux);

	mp4_write_stbl(mux, track, fragmented);

	return write_box_size(out, box_start);
}
/// 8.4.1 Media Box
/* Writes the "mdia" container for `track` (mdhd, hdlr and minf, in that
 * order) and returns the total box size. */
static size_t mp4_write_mdia(struct mp4_mux *mux, struct mp4_track *track,
			     bool fragmented)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_box(out, 0, "mdia");

	mp4_write_mdhd(mux, track);
	mp4_write_hdlr(mux, track);
	mp4_write_minf(mux, track, fragmented);

	return write_box_size(out, box_start);
}
/// (QTFF/Apple) User data atom
/* Writes a box named `tag` whose payload is the bytes of the C string `val`
 * (without the terminating NUL) and returns the box size. `val` must not be
 * NULL since its length is taken with strlen(). */
static size_t mp4_write_udta_atom(struct mp4_mux *mux, const char tag[4],
				  const char *val)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);
	size_t val_len = strlen(val);

	write_box(out, 0, tag);
	s_write(out, val, val_len);

	return write_box_size(out, box_start);
}
/// 8.10.1 User Data Box
/* Writes the per-track "udta" box and returns its size.
 *
 * The box contains QuickTime format user data atoms, which are simple
 * key-value pairs (some keys are prefixed with 0xa9):
 *  - "name":     encoder name, if the encoder reports one
 *  - "\251enc":  encoder id   (only with MP4_WRITE_ENCODER_INFO)
 *  - "json":     encoder settings including defaults, serialized as JSON
 *                (only with MP4_WRITE_ENCODER_INFO)
 *
 * Each atom is skipped when its value is unavailable, because
 * mp4_write_udta_atom() calls strlen() on the value. */
static size_t mp4_write_track_udta(struct mp4_mux *mux, struct mp4_track *track)
{
	struct serializer *s = mux->serializer;
	int64_t start = serializer_get_pos(s);

	write_box(s, 0, "udta");

	const char *name = obs_encoder_get_name(track->encoder);
	if (name)
		mp4_write_udta_atom(mux, "name", name);

	if (mux->flags & MP4_WRITE_ENCODER_INFO) {
		const char *id = obs_encoder_get_id(track->encoder);
		/* Guard on the value that is actually written (the id), not
		 * on the encoder name. */
		if (id)
			mp4_write_udta_atom(mux, "\251enc", id);

		obs_data_t *settings = obs_encoder_get_settings(track->encoder);
		if (settings) {
			const char *json =
				obs_data_get_json_with_defaults(settings);
			if (json)
				mp4_write_udta_atom(mux, "json", json);
			obs_data_release(settings);
		}
	}

	return write_box_size(s, start);
}
/// 8.6.6 Edit List Box
static size_t mp4_write_elst(struct mp4_mux *mux, struct mp4_track *track)
{
struct serializer *s = mux->serializer;
int64_t start = serializer_get_pos(s);
write_fullbox(s, 0, "elst", 0, 0);
s_wb32(s, 1); // entry count
uint64_t duration =
util_mul_div64(track->duration, 1000, track->timebase_den);
uint64_t delay = 0;
if (track->type == TRACK_VIDEO &&
!(mux->flags & MP4_USE_NEGATIVE_CTS)) {
/* Compensate for frame-reordering delay (for example, when
* using b-frames). */
int64_t dts_offset = 0;
if (track->offsets.num) {
struct sample_offset sample = track->offsets.array[0];
dts_offset = sample.offset;
} else if (track->packets.size) {
/* If no offset data exists yet (i.e. when writing the
* incomplete moov in a fragmented file) use the raw
* data from the current queued packets instead. */
struct encoder_packet pkt;
deque_peek_front(&track->packets, &pkt, sizeof(pkt));
dts_offset = pkt.pts - pkt.dts;
}
delay = util_mul_div64(dts_offset, track->timescale,
track->timebase_den);
} else if (track->type == TRACK_AUDIO && track->first_pts < 0) {
delay = util_mul_div64(llabs(track->first_pts),
track->timescale, track->timebase_den);
/* Subtract priming delay from total duration */
duration -= util_mul_div64(delay, 1000, track->timescale);
}
s_wb32(s, (uint32_t)duration); // segment_duration (movie timescale)
s_wb32(s, (uint32_t)delay); // media_time (track timescale)
s_wb32(s, 1 << 16); // media_rate
return write_box_size(s, start);
}
/// 8.6.5 Edit Box
/* Writes the "edts" container wrapping the track's edit list and returns
 * the total box size. */
static size_t mp4_write_edts(struct mp4_mux *mux, struct mp4_track *track)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_box(out, 0, "edts");
	mp4_write_elst(mux, track);

	return write_box_size(out, box_start);
}
/// 8.3.3.2 TrackReferenceTypeBox
/* Writes a "chap" box holding the chapter track's id and returns its size.
 * Dereferences mux->chapter_track, so the chapter track must exist. */
static size_t mp4_write_chap(struct mp4_mux *mux)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	/// QTFF/Apple chapter track reference
	write_box(out, 0, "chap");
	s_wb32(out, mux->chapter_track->track_id);

	return write_box_size(out, box_start);
}
/// 8.3.3 Track Reference Box
/* Writes the "tref" container with a single "chap" reference inside and
 * returns the total box size. */
static size_t mp4_write_tref(struct mp4_mux *mux)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_box(out, 0, "tref");
	mp4_write_chap(mux);

	return write_box_size(out, box_start);
}
/// 8.3.1 Track Box
/* Writes the "trak" box for `track` and returns its size, or returns 0
 * without writing anything when a full (non-fragmented) moov is requested
 * and the track has no chunks.
 *
 * Children, in order: tkhd, edts, tref (only when a chapter track exists and
 * this is not the chapter track itself), mdia, udta. */
static size_t mp4_write_trak(struct mp4_mux *mux, struct mp4_track *track,
			     bool fragmented)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	/* If track has no data, omit it from full moov. */
	const bool track_is_empty = track->chunks.num == 0;
	if (track_is_empty && !fragmented)
		return 0;

	write_box(out, 0, "trak");

	mp4_write_tkhd(mux, track);
	mp4_write_edts(mux, track);

	/* Reference the chapter track from every other track */
	if (track->type != TRACK_CHAPTERS && mux->chapter_track)
		mp4_write_tref(mux);

	mp4_write_mdia(mux, track, fragmented);

	/* udta (audio track name mainly) */
	mp4_write_track_udta(mux, track);

	return write_box_size(out, box_start);
}
/// 8.8.3 Track Extends Box
/* Writes a fixed-size (32 byte) "trex" box for `track_id` with sample
 * description index 1 and all other defaults zero. Returns 32. */
static size_t mp4_write_trex(struct mp4_mux *mux, uint32_t track_id)
{
	const uint32_t box_size = 32;
	struct serializer *out = mux->serializer;

	write_fullbox(out, box_size, "trex", 0, 0);

	s_wb32(out, track_id); // track_ID
	s_wb32(out, 1);        // default_sample_description_index
	s_wb32(out, 0);        // default_sample_duration
	s_wb32(out, 0);        // default_sample_size
	s_wb32(out, 0);        // default_sample_flags

	return box_size;
}
/// 8.8.1 Movie Extends Box
/* Writes the "mvex" container with one "trex" box per entry in mux->tracks
 * and returns the total box size. The track id passed to each trex is the
 * array index plus one. */
static size_t mp4_write_mvex(struct mp4_mux *mux)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);
	size_t num_tracks = mux->tracks.num;

	write_box(out, 0, "mvex");

	for (size_t idx = 0; idx < num_tracks; idx++) {
		uint32_t track_id = (uint32_t)(idx + 1);
		mp4_write_trex(mux, track_id);
	}

	return write_box_size(out, box_start);
}
/// (QTFF/Apple) Undocumented QuickTime/iTunes metadata handler
/* Writes a fixed-size (33 byte) "hdlr" box with handler type "mdir", the
 * first reserved word set to "appl" and an empty name. Returns 33. */
static size_t mp4_write_itunes_hdlr(struct mp4_mux *mux)
{
	const uint32_t box_size = 33;
	struct serializer *out = mux->serializer;

	write_fullbox(out, box_size, "hdlr", 0, 0);

	s_wb32(out, 0);          // pre_defined
	s_write(out, "mdir", 4); // handler_type

	/* reserved (3 x 32 bit) */
	s_write(out, "appl", 4);
	s_wb32(out, 0);
	s_wb32(out, 0);

	s_w8(out, 0); // name (NULL)

	return box_size;
}
/// (QTFF/Apple) Data atom
/* Writes a "data" box carrying the C string `data` (without its NUL
 * terminator) tagged as a UTF-8 string, and returns the box size: a 16 byte
 * header (box header, type and locale) plus the string length. */
static size_t mp4_write_data_atom(struct mp4_mux *mux, const char *data)
{
	struct serializer *out = mux->serializer;
	size_t payload_len = strlen(data);
	uint32_t box_size = 16 + (uint32_t)payload_len;

	write_box(out, box_size, "data");

	s_wb32(out, 1); // type, 1 = utf-8 string
	s_wb32(out, 0); // locale, 0 = default
	s_write(out, data, payload_len);

	return box_size;
}
/// (QTFF/Apple) Metadata item atom
/* Writes a box named `name` that wraps a single "data" atom holding `value`
 * and returns the total box size. */
static size_t mp4_write_ilst_item_atom(struct mp4_mux *mux, const char name[4],
				       const char *value)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_box(out, 0, name);
	mp4_write_data_atom(mux, value);

	return write_box_size(out, box_start);
}
/// (QTFF/Apple) Metadata item list atom
static size_t mp4_write_ilst(struct mp4_mux *mux)
{
struct serializer *s = mux->serializer;
struct dstr value = {0};
int64_t start = serializer_get_pos(s);
write_box(s, 0, "ilst");
/* Encoder name */
dstr_cat(&value, "OBS Studio (");
dstr_cat(&value, obs_get_version_string());
dstr_cat(&value, ")");
/* Some QuickTime keys are prefixed with 0xa9 */
mp4_write_ilst_item_atom(mux, "\251too", value.array);
dstr_free(&value);
return write_box_size(s, start);
}
/// (QTFF/Apple) Key value metadata handler
/* Writes a fixed-size (33 byte) "hdlr" box with handler type "mdta", zeroed
 * reserved fields and an empty name. Returns 33. */
static size_t mp4_write_mdta_hdlr(struct mp4_mux *mux)
{
	const uint32_t box_size = 33;
	struct serializer *out = mux->serializer;

	write_fullbox(out, box_size, "hdlr", 0, 0);

	s_wb32(out, 0);          // pre_defined
	s_write(out, "mdta", 4); // handler_type

	/* reserved (3 x 32 bit) */
	for (int i = 0; i < 3; i++)
		s_wb32(out, 0);

	s_w8(out, 0); // name (NULL)

	return box_size;
}
/// (QTFF/Apple) Metadata item keys atom
/* Writes a "keys" box listing the name of every item in `meta` as an "mdta"
 * key entry and returns the total box size. The entry count is written as a
 * placeholder first and patched once all items have been visited. */
static size_t mp4_write_mdta_keys(struct mp4_mux *mux, obs_data_t *meta)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_fullbox(out, 0, "keys", 0, 0);

	/* Remember where the count lives so it can be fixed up later */
	int64_t count_pos = serializer_get_pos(out);
	uint32_t count = 0;
	s_wb32(out, count); // count

	obs_data_item_t *item = obs_data_first(meta);
	while (item != NULL) {
		const char *key = obs_data_item_get_name(item);
		size_t key_len = strlen(key);

		/* name is key type, can be udta or mdta */
		write_box(out, key_len + 8, "mdta");
		s_write(out, key, key_len); // key name
		count++;

		obs_data_item_next(&item);
	}

	/* Overwrite count with correct value, then return to the end */
	int64_t box_end = serializer_get_pos(out);
	serializer_seek(out, count_pos, SERIALIZE_SEEK_START);
	s_wb32(out, count);
	serializer_seek(out, box_end, SERIALIZE_SEEK_START);

	return write_box_size(out, box_start);
}
/// (QTFF/Apple) Metadata item atom, but name is an index instead
/* Writes one item entry: a 32-bit size placeholder, the 32-bit key index
 * `idx`, and a "data" atom with the item's string value. The placeholder is
 * then patched with the real size. */
static inline void write_key_entry(struct mp4_mux *mux, obs_data_item_t *item,
				   uint32_t idx)
{
	struct serializer *out = mux->serializer;
	int64_t entry_start = serializer_get_pos(out);
	const char *value = obs_data_item_get_string(item);

	s_wb32(out, 0);   // size
	s_wb32(out, idx); // index
	mp4_write_data_atom(mux, value);

	write_box_size(out, entry_start);
}
/// (QTFF/Apple) Metadata item list atom
/* Writes an "ilst" box with one index-keyed entry per item in `meta` and
 * returns the total box size. Key indices start at 1 and follow the
 * iteration order of `meta`. */
static size_t mp4_write_mdta_ilst(struct mp4_mux *mux, obs_data_t *meta)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_box(out, 0, "ilst");

	/* indices start with 1 */
	uint32_t next_idx = 1;
	obs_data_item_t *item = obs_data_first(meta);

	while (item != NULL) {
		write_key_entry(mux, item, next_idx++);
		obs_data_item_next(&item);
	}

	return write_box_size(out, box_start);
}
static void mp4_write_mdta_kv(struct mp4_mux *mux)
{
struct dstr value = {0};
obs_data_t *meta = obs_data_create();
dstr_cat(&value, "OBS Studio (");
dstr_cat(&value, obs_get_version_string());
dstr_cat(&value, ")");
// ToDo figure out what else we could put in here for fun and profit :)
obs_data_set_string(meta, "tool", value.array);
/* Write keys */
mp4_write_mdta_keys(mux, meta);
/* Write values */
mp4_write_mdta_ilst(mux, meta);
obs_data_release(meta);
dstr_free(&value);
}
/// 8.11.1 The Meta box
/* Writes the "meta" box and returns its size. With MP4_USE_MDTA_KEY_VALUE
 * set the box holds the "mdta" handler plus keys/ilst pair; otherwise it
 * holds the iTunes-style handler plus a plain ilst. */
static size_t mp4_write_meta(struct mp4_mux *mux)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);
	const bool key_value = (mux->flags & MP4_USE_MDTA_KEY_VALUE) != 0;

	write_fullbox(out, 0, "meta", 0, 0);

	if (!key_value) {
		mp4_write_itunes_hdlr(mux);
		mp4_write_ilst(mux);
	} else {
		mp4_write_mdta_hdlr(mux);
		mp4_write_mdta_kv(mux);
	}

	return write_box_size(out, box_start);
}
/// 8.10.1 User Data Box
/* Writes the movie-level "udta" box, which only contains the "meta" box,
 * and returns the total box size. */
static size_t mp4_write_udta(struct mp4_mux *mux)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_box(out, 0, "udta");

	/* Normally metadata would be directly in the moov, but since this is
	 * Apple/QTFF format metadata it is inside udta. */
	mp4_write_meta(mux);

	return write_box_size(out, box_start);
}
/// Movie Box (8.2.1)
/* Writes the "moov" box and returns its size.
 *
 * Children, in order: mvhd, one trak per entry in mux->tracks, then either
 * the chapter track's trak (non-fragmented, when a chapter track exists) or
 * mvex (fragmented), and finally the udta metadata box. */
static size_t mp4_write_moov(struct mp4_mux *mux, bool fragmented)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_box(out, 0, "moov");
	mp4_write_mvhd(mux);

	/* trak(s) */
	size_t num_tracks = mux->tracks.num;
	for (size_t idx = 0; idx < num_tracks; idx++)
		mp4_write_trak(mux, &mux->tracks.array[idx], fragmented);

	if (fragmented) {
		/* mvex */
		mp4_write_mvex(mux);
	} else if (mux->chapter_track) {
		mp4_write_trak(mux, mux->chapter_track, false);
	}

	/* udta (metadata) */
	mp4_write_udta(mux);

	return write_box_size(out, box_start);
}
/* ========================================================================== */
/* moof (fragment header) stuff */
/// 8.8.5 Movie Fragment Header Box
/* Writes a fixed-size (16 byte) "mfhd" box using mux->fragments_written as
 * the sequence number. Returns 16. */
static size_t mp4_write_mfhd(struct mp4_mux *mux)
{
	const uint32_t box_size = 16;
	struct serializer *out = mux->serializer;

	write_fullbox(out, box_size, "mfhd", 0, 0);
	s_wb32(out, mux->fragments_written); // sequence_number

	return box_size;
}
/// 8.8.7 Track Fragment Header Box
/* Writes the "tfhd" box for the pending fragment of `track` and returns its
 * size.
 *
 * mux        - muxer whose current serializer receives the box
 * track      - track being written; when it has no fixed sample size, its
 *              fragment_samples array must contain at least one entry
 * moof_start - written as base_data_offset
 *
 * base_data_offset and default_sample_flags are always present.
 * default_sample_duration / default_sample_size are only present when every
 * sample of the fragment has the same duration / size (always the case for
 * tracks with a fixed sample size). */
static size_t mp4_write_tfhd(struct mp4_mux *mux, struct mp4_track *track,
			     size_t moof_start)
{
	struct serializer *s = mux->serializer;
	int64_t start = serializer_get_pos(s);

	uint32_t flags = BASE_DATA_OFFSET_PRESENT |
			 DEFAULT_SAMPLE_FLAGS_PRESENT;

	/* Add default size/duration if all samples match. */
	bool durations_match = true;
	bool sizes_match = true;
	uint32_t duration;
	uint32_t sample_size;

	if (track->sample_size) {
		duration = 1;
		sample_size = track->sample_size;
	} else {
		duration = track->fragment_samples.array[0].duration;
		sample_size = track->fragment_samples.array[0].size;

		for (size_t idx = 1; idx < track->fragment_samples.num; idx++) {
			uint32_t frag_duration =
				track->fragment_samples.array[idx].duration;
			uint32_t frag_size =
				track->fragment_samples.array[idx].size;

			/* A single mismatch must stick. Plain assignment
			 * here would let the last sample alone decide the
			 * result and hide mismatches earlier in the
			 * fragment. */
			if (frag_duration != duration)
				durations_match = false;
			if (frag_size != sample_size)
				sizes_match = false;
		}
	}

	/* NOTE(review): mp4_write_trun() does not emit per-sample durations,
	 * so a fragment with mixed durations carries no duration information
	 * at all. Confirm whether trun should write them in that case. */
	if (durations_match)
		flags |= DEFAULT_SAMPLE_DURATION_PRESENT;
	if (sizes_match)
		flags |= DEFAULT_SAMPLE_SIZE_PRESENT;

	write_fullbox(s, 0, "tfhd", 0, flags);

	s_wb32(s, track->track_id); // track_ID
	s_wb64(s, moof_start);      // base_data_offset

	// default_sample_duration
	if (durations_match) {
		if (track->type == TRACK_VIDEO) {
			/* Convert duration to track timescale */
			duration = (uint32_t)util_mul_div64(
				duration, track->timescale,
				track->timebase_den);
		}
		s_wb32(s, duration);
	}

	// default_sample_size
	if (sizes_match)
		s_wb32(s, sample_size);

	// default_sample_flags
	if (track->type == TRACK_VIDEO) {
		s_wb32(s, SAMPLE_FLAG_DEPENDS_YES | SAMPLE_FLAG_IS_NON_SYNC);
	} else {
		s_wb32(s, SAMPLE_FLAG_DEPENDS_NO);
	}

	return write_box_size(s, start);
}
/// 8.8.12 Track fragment decode time
/* Writes a fixed-size (20 byte) version 1 "tfdt" box and returns 20.
 *
 * baseMediaDecodeTime is the track's accumulated duration minus the
 * durations of the samples still waiting in fragment_samples; for video it
 * is additionally rescaled to the track timescale. */
static size_t mp4_write_tfdt(struct mp4_mux *mux, struct mp4_track *track)
{
	const uint32_t box_size = 20;
	struct serializer *out = mux->serializer;

	write_fullbox(out, box_size, "tfdt", 1, 0);

	/* Total duration of the samples that are not written yet */
	uint64_t pending = 0;
	size_t num_pending = track->fragment_samples.num;
	for (size_t i = 0; i < num_pending; i++)
		pending += track->fragment_samples.array[i].duration;

	uint64_t decode_time = track->duration;
	decode_time -= pending;

	if (track->type == TRACK_VIDEO) {
		/* Convert to track timescale */
		decode_time = util_mul_div64(decode_time, track->timescale,
					     track->timebase_den);
	}

	s_wb64(out, decode_time); // baseMediaDecodeTime

	return box_size;
}
/// 8.8.8 Track Fragment Run Box
/* Writes the "trun" box for the pending fragment of `track` and returns its
 * size.
 *
 * moof_size           - size of the enclosing moof, used to compute
 *                       data_offset
 * samples_mdat_offset - in/out: running byte offset into the mdat payload;
 *                       advanced by the total size of this track's samples
 *
 * Tracks with a fixed sample size only get sample_count and data_offset.
 * Other tracks get a per-sample size, and video additionally gets
 * first_sample_flags and a per-sample composition time offset. */
static size_t mp4_write_trun(struct mp4_mux *mux, struct mp4_track *track,
			     uint32_t moof_size, uint64_t *samples_mdat_offset)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);
	const bool is_video = track->type == TRACK_VIDEO;

	uint32_t flags = DATA_OFFSET_PRESENT;
	if (!track->sample_size)
		flags |= SAMPLE_SIZE_PRESENT;
	if (is_video) {
		flags |= FIRST_SAMPLE_FLAGS_PRESENT;
		flags |= SAMPLE_COMPOSITION_TIME_OFFSETS_PRESENT;
	}

	uint8_t version = mux->flags & MP4_USE_NEGATIVE_CTS ? 1 : 0;
	write_fullbox(out, 0, "trun", version, flags);

	/* moof_size + 8 bytes for mdat header + offset into mdat box data */
	size_t data_offset = moof_size + 8 + *samples_mdat_offset;
	size_t sample_count = track->fragment_samples.num;

	if (track->sample_size) {
		/* Fixed sample size (PCM audio): derive the sample count
		 * from the total byte size; only count and offset are
		 * written. */
		size_t total_bytes = 0;
		for (size_t i = 0; i < sample_count; i++)
			total_bytes += track->fragment_samples.array[i].size;

		*samples_mdat_offset += total_bytes;
		sample_count = total_bytes / track->sample_size;

		s_wb32(out, (uint32_t)sample_count); // sample_count
		s_wb32(out, (uint32_t)data_offset);  // data_offset

		return write_box_size(out, box_start);
	}

	s_wb32(out, (uint32_t)sample_count); // sample_count
	s_wb32(out, (uint32_t)data_offset);  // data_offset

	if (is_video)
		s_wb32(out, SAMPLE_FLAG_DEPENDS_NO); // first_sample_flags

	for (size_t i = 0; i < sample_count; i++) {
		struct fragment_sample *smp =
			&track->fragment_samples.array[i];

		s_wb32(out, smp->size); // sample_size

		if (is_video) {
			/* sample_composition_time_offset, rescaled to the
			 * track timescale */
			int64_t cts_offset = (int64_t)smp->offset *
					     (int64_t)track->timescale /
					     (int64_t)track->timebase_den;
			s_wb32(out, (uint32_t)cts_offset);
		}

		*samples_mdat_offset += smp->size;
	}

	return write_box_size(out, box_start);
}
/// 8.8.6 Track Fragment Box
/* Writes the "traf" container for `track` (tfhd, tfdt and trun, in that
 * order) and returns the total box size. `samples_mdat_offset` is forwarded
 * to the trun writer, which advances it. */
static size_t mp4_write_traf(struct mp4_mux *mux, struct mp4_track *track,
			     int64_t moof_start, uint32_t moof_size,
			     uint64_t *samples_mdat_offset)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_box(out, 0, "traf");

	mp4_write_tfhd(mux, track, moof_start);
	mp4_write_tfdt(mux, track);
	mp4_write_trun(mux, track, moof_size, samples_mdat_offset);

	return write_box_size(out, box_start);
}
/// 8.8.4 Movie Fragment Box
/* Writes the "moof" box (mfhd plus one traf per track in mux->tracks that
 * has pending fragment samples) and returns the total box size. */
static size_t mp4_write_moof(struct mp4_mux *mux, uint32_t moof_size,
			     int64_t moof_start)
{
	struct serializer *out = mux->serializer;
	int64_t box_start = serializer_get_pos(out);

	write_box(out, 0, "moof");
	mp4_write_mfhd(mux);

	/* Track current mdat offset across tracks */
	uint64_t mdat_offset = 0;

	size_t num_tracks = mux->tracks.num;
	for (size_t idx = 0; idx < num_tracks; idx++) {
		struct mp4_track *track = &mux->tracks.array[idx];

		/* Only tracks with samples get a traf box */
		if (track->fragment_samples.num) {
			mp4_write_traf(mux, track, moof_start, moof_size,
				       &mdat_offset);
		}
	}

	return write_box_size(out, box_start);
}
/* ========================================================================== */
/* Chapter packets */
static void mp4_create_chapter_pkt(struct encoder_packet *pkt, int64_t dts_usec,
const char *name)
{
int64_t dts = dts_usec / 1000; // chapter track uses a ms timebase
pkt->pts = dts;
pkt->dts = dts;
pkt->dts_usec = dts_usec;
pkt->timebase_num = 1;
pkt->timebase_den = 1000;
/* Serialize with data with ref count */
struct serializer s;
struct array_output_data ao;
array_output_serializer_init(&s, &ao);
size_t len = min(strlen(name), UINT16_MAX);
long refs = 1;
/* encoder_packet refs */
s_write(&s, &refs, sizeof(refs));
/* actual packet data */
s_wb16(&s, (uint16_t)len);
s_write(&s, name, len);
s_write(&s, &CHAPTER_PKT_FOOTER, sizeof(CHAPTER_PKT_FOOTER));
pkt->data = (void *)(ao.bytes.array + sizeof(long));
pkt->size = ao.bytes.num - sizeof(long);
}
/* ========================================================================== */
/* Encoder packet processing and fragment writer */
/* Returns the packet's pts scaled by 1000000 and divided by its
 * timebase_den. */
static inline int64_t packet_pts_usec(struct encoder_packet *packet)
{
	const int64_t usec_per_sec = 1000000;

	return packet->pts * usec_per_sec / packet->timebase_den;
}
/* Returns a pointer to the packet at position `idx` inside the deque,
 * treating the deque as a sequence of struct encoder_packet values. */
static inline struct encoder_packet *get_pkt_at(struct deque *dq, size_t idx)
{
	size_t byte_offset = idx * sizeof(struct encoder_packet);

	return deque_data(dq, byte_offset);
}
/* Returns the largest duration among mux->tracks, with each duration
 * rescaled from the track's timebase_den to 1000 units per second.
 * Returns 0 when there are no tracks. */
static inline uint64_t get_longest_track_duration(struct mp4_mux *mux)
{
	uint64_t longest = 0;
	size_t num_tracks = mux->tracks.num;

	for (size_t idx = 0; idx < num_tracks; idx++) {
		struct mp4_track *track = &mux->tracks.array[idx];
		uint64_t scaled = util_mul_div64(track->duration, 1000,
						 track->timebase_den);

		longest = scaled > longest ? scaled : longest;
	}

	return longest;
}
/* Turns the packets queued on `track` into sample bookkeeping.
 *
 * For every queued packet except the last one (stopping early at the first
 * packet whose pts reaches mux->next_frag_pts, if that is set) this:
 *  - appends a fragment_sample (size, pts-dts offset, duration) used when
 *    writing the fragment header,
 *  - adds the packet size to *mdat_size,
 *  - updates the track-wide tables (duration, sample count, deltas, sample
 *    sizes, and for video also sync samples and composition offsets).
 *
 * Packets are only inspected here; they stay in the queue. */
static void process_packets(struct mp4_mux *mux, struct mp4_track *track,
uint64_t *mdat_size)
{
size_t count = track->packets.size / sizeof(struct encoder_packet);
if (!count)
return;
/* Only iterate up to the penultimate packet so we can determine the
* duration for all processed packets. */
for (size_t i = 0; i < count - 1; i++) {
struct encoder_packet *pkt = get_pkt_at(&track->packets, i);
/* Packets at or past the fragment boundary are left for a later call. */
if (mux->next_frag_pts &&
packet_pts_usec(pkt) >= mux->next_frag_pts)
break;
struct encoder_packet *next =
get_pkt_at(&track->packets, i + 1);
/* Duration is just distance between current and next DTS. */
uint32_t duration = (uint32_t)(next->dts - pkt->dts);
uint32_t sample_count = 1;
uint32_t size = (uint32_t)pkt->size;
int32_t offset = (int32_t)(pkt->pts - pkt->dts);
/* When using negative CTS, subtract DTS-PTS offset. The offset of the
* first processed video sample is remembered as the track's dts_offset. */
if (track->type == TRACK_VIDEO &&
mux->flags & MP4_USE_NEGATIVE_CTS) {
if (!track->offsets.num)
track->dts_offset = offset;
offset -= track->dts_offset;
}
/* Create temporary sample information for moof */
struct fragment_sample *smp =
da_push_back_new(track->fragment_samples);
smp->size = size;
smp->offset = offset;
smp->duration = duration;
*mdat_size += size;
/* Update global sample information for full moov */
track->duration += duration;
if (track->sample_size) {
/* Adjust duration/count for fixed sample size: one packet holds
* size / sample_size samples, each with a delta of 1. */
sample_count = size / track->sample_size;
duration = 1;
}
/* Remember the pts of the very first sample of the track */
if (!track->samples)
track->first_pts = pkt->pts;
track->samples += sample_count;
/* If delta (duration) matches the previous one, increment the counter,
* otherwise create a new entry. */
if (track->deltas.num == 0 ||
track->deltas.array[track->deltas.num - 1].delta !=
duration) {
struct sample_delta *new =
da_push_back_new(track->deltas);
new->delta = duration;
new->count = sample_count;
} else {
track->deltas.array[track->deltas.num - 1].count +=
sample_count;
}
/* Per-sample sizes are only tracked when the size is not fixed */
if (!track->sample_size)
da_push_back(track->sample_sizes, &size);
/* Everything below only applies to video tracks */
if (track->type != TRACK_VIDEO)
continue;
/* Record the (already incremented) sample count for keyframes */
if (pkt->keyframe)
da_push_back(track->sync_samples, &track->samples);
/* Only require ctts box if offset is non-zero */
if (offset && !track->needs_ctts)
track->needs_ctts = true;
/* If the pts-dts offset matches the previous one, increment the counter,
* otherwise create a new entry. */
if (track->offsets.num == 0 ||
track->offsets.array[track->offsets.num - 1].offset !=
offset) {
struct sample_offset *new =
da_push_back_new(track->offsets);
new->offset = offset;
new->count = 1;
} else {
track->offsets.array[track->offsets.num - 1].count += 1;
}
}
}
/* Write track data to file.
 *
 * Pops one queued packet per pending fragment sample, writes its payload to
 * the muxer's serializer and releases it. The written range is recorded as a
 * new chunk on the track, and the pending fragment samples are cleared.
 * Does nothing if the track has no queued packets or no pending samples. */
static void write_packets(struct mp4_mux *mux, struct mp4_track *track)
{
	struct serializer *out = mux->serializer;
	size_t queued = track->packets.size / sizeof(struct encoder_packet);
	size_t pending = track->fragment_samples.num;

	if (queued == 0 || pending == 0)
		return;

	struct chunk *chunk = da_push_back_new(track->chunks);
	chunk->offset = serializer_get_pos(out);

	for (size_t written = 0; written < pending; written++) {
		struct encoder_packet pkt;

		deque_pop_front(&track->packets, &pkt, sizeof(pkt));
		s_write(out, pkt.data, pkt.size);
		obs_encoder_packet_release(&pkt);
	}

	chunk->size = (uint32_t)(serializer_get_pos(out) - chunk->offset);

	/* Fixed-size codecs count samples by bytes written, everything else
	 * has one sample per packet. */
	if (track->sample_size)
		chunk->samples = chunk->size / track->sample_size;
	else
		chunk->samples = (uint32_t)pending;

	da_clear(track->fragment_samples);
}
/* Writes one fragment (moof + mdat) to the muxer's output.
 *
 * On the first call the file header (ftyp, a free-box placeholder and the
 * initial fragmented moov) is written first. Boxes are assembled in a
 * temporary array serializer and copied to the real serializer in one piece,
 * so the size fix-up seeks never reach the real output. Afterwards the
 * queued packets of every track are written as the mdat payload and
 * mux->next_frag_pts is reset to 0. */
static void mp4_flush_fragment(struct mp4_mux *mux)
{
/* `s` keeps pointing at the real output while mux->serializer is swapped */
struct serializer *s = mux->serializer;
// Write file header if not already done
if (!mux->fragments_written) {
mp4_write_ftyp(mux, true);
/* Placeholder to write mdat header during soft-remux */
mux->placeholder_offset = serializer_get_pos(s);
mp4_write_free(mux);
}
// Array output as temporary buffer to avoid sending seeks to disk
struct serializer as;
struct array_output_data aod;
array_output_serializer_init(&as, &aod);
mux->serializer = &as;
// Write initial incomplete moov (because fragmentation)
if (!mux->fragments_written) {
mp4_write_moov(mux, true);
s_write(s, aod.bytes.array, aod.bytes.num);
array_output_serializer_reset(&aod);
}
mux->fragments_written++;
/* --------------------------------------------------------- */
/* Analyse packets and create fragment moof. */
/* Starts at 8 to account for the mdat box header (size + type) */
uint64_t mdat_size = 8;
for (size_t idx = 0; idx < mux->tracks.num; idx++) {
struct mp4_track *track = &mux->tracks.array[idx];
process_packets(mux, track, &mdat_size);
}
/* next_frag_pts == 0 is treated as the final flush here and below */
if (!mux->next_frag_pts && mux->chapter_track) {
// Create dummy chapter marker at the end so duration is correct
uint64_t duration = get_longest_track_duration(mux);
/* Only the fields set by mp4_create_chapter_pkt() are initialized */
struct encoder_packet pkt;
mp4_create_chapter_pkt(&pkt, (int64_t)duration * 1000, "Dummy");
deque_push_back(&mux->chapter_track->packets, &pkt,
sizeof(struct encoder_packet));
process_packets(mux, mux->chapter_track, &mdat_size);
}
// write moof once to get size
int64_t moof_start = serializer_get_pos(s);
size_t moof_size = mp4_write_moof(mux, 0, moof_start);
array_output_serializer_reset(&aod);
// write moof again with known size
mp4_write_moof(mux, (uint32_t)moof_size, moof_start);
// Write to output and restore real serializer
s_write(s, aod.bytes.array, aod.bytes.num);
mux->serializer = s;
array_output_serializer_free(&aod);
/* --------------------------------------------------------- */
/* Write audio and video samples (in chunks). Also update */
/* global chunk and sample information for final moov. */
/* A payload that does not fit in 32 bits uses the extended header form:
* size field 1, followed by a 64-bit size after the box type. */
if (mdat_size > UINT32_MAX) {
s_wb32(s, 1);
s_write(s, "mdat", 4);
s_wb64(s, mdat_size + 8);
} else {
s_wb32(s, (uint32_t)mdat_size);
s_write(s, "mdat", 4);
}
for (size_t i = 0; i < mux->tracks.num; i++) {
struct mp4_track *track = &mux->tracks.array[i];
write_packets(mux, track);
}
/* Only write chapter packets on final flush. */
if (!mux->next_frag_pts && mux->chapter_track)
write_packets(mux, mux->chapter_track);
mux->next_frag_pts = 0;
}
/* ========================================================================== */
/* Track object functions */
/* Appends a copy of `pkt` to the track's packet queue and raises
 * track->last_pts_usec if the packet's pts (in microseconds) is newer. */
static inline void track_insert_packet(struct mp4_track *track,
				       struct encoder_packet *pkt)
{
	int64_t pkt_usec = packet_pts_usec(pkt);

	if (track->last_pts_usec < pkt_usec)
		track->last_pts_usec = pkt_usec;

	deque_push_back(&track->packets, pkt, sizeof(*pkt));
}
/* Returns the fixed size in bytes of one sample (all channels) for PCM
 * codecs: channel count times 4, 3 or 2 bytes for F32, I24 and I16
 * respectively. Returns 0 for any other codec or when the track's encoder
 * has no audio output. */
static inline uint32_t get_sample_size(struct mp4_track *track)
{
	audio_t *audio = obs_encoder_audio(track->encoder);
	if (!audio)
		return 0;

	const struct audio_output_info *info = audio_output_get_info(audio);
	uint32_t channels = get_audio_channels(info->speakers);
	uint32_t bytes_per_channel;

	switch (track->codec) {
	case CODEC_PCM_F32:
		bytes_per_channel = 4; // 32-bit
		break;
	case CODEC_PCM_I24:
		bytes_per_channel = 3; // 24-bit
		break;
	case CODEC_PCM_I16:
		bytes_per_channel = 2; // 16-bit
		break;
	default:
		return 0;
	}

	return channels * bytes_per_channel;
}
/* Maps the encoder's codec string to the matching mp4_codec value.
 * Returns CODEC_UNKNOWN for any string that is not in the table. */
static inline enum mp4_codec get_codec(obs_encoder_t *enc)
{
	static const struct {
		const char *name;
		enum mp4_codec codec;
	} known_codecs[] = {
		{"h264", CODEC_H264},
		{"hevc", CODEC_HEVC},
		{"av1", CODEC_AV1},
		{"aac", CODEC_AAC},
		{"opus", CODEC_OPUS},
		{"flac", CODEC_FLAC},
		{"alac", CODEC_ALAC},
		{"pcm_s16le", CODEC_PCM_I16},
		{"pcm_s24le", CODEC_PCM_I24},
		{"pcm_f32le", CODEC_PCM_F32},
	};
	const size_t num_known = sizeof(known_codecs) / sizeof(known_codecs[0]);

	const char *codec = obs_encoder_get_codec(enc);

	for (size_t i = 0; i < num_known; i++) {
		if (strcmp(codec, known_codecs[i].name) == 0)
			return known_codecs[i].codec;
	}

	return CODEC_UNKNOWN;
}
/* Appends a new track for encoder `enc` to mux->tracks.
 *
 * The track takes a reference on the encoder and the next free track id.
 * Video tracks use the video output's frame rate as timebase and a
 * timescale derived from it; audio tracks use the sample rate for both and
 * additionally get their fixed sample size (0 if not fixed). */
static inline void add_track(struct mp4_mux *mux, obs_encoder_t *enc)
{
	struct mp4_track *track = da_push_back_new(mux->tracks);
	const bool is_video = obs_encoder_get_type(enc) == OBS_ENCODER_VIDEO;

	track->type = is_video ? TRACK_VIDEO : TRACK_AUDIO;
	track->encoder = obs_encoder_get_ref(enc);
	track->codec = get_codec(enc);
	track->track_id = ++mux->track_ctr;

	if (is_video) {
		/* Timebase is the inverse of the frame rate */
		video_t *video = obs_encoder_video(enc);
		const struct video_output_info *info =
			video_output_get_info(video);

		track->timebase_num = info->fps_den;
		track->timebase_den = info->fps_num;
		track->timescale = track->timebase_den;

		/* FFmpeg does this to compensate for non-monotonic timestamps,
		 * we probably don't need it, but let's stick to what they do
		 * for maximum compatibility. */
		while (track->timescale < 10000)
			track->timescale *= 2;

		return;
	}

	/* Opus is always 48 kHz */
	uint32_t sample_rate = obs_encoder_get_sample_rate(enc);
	if (track->codec == CODEC_OPUS)
		sample_rate = 48000;

	track->timebase_num = 1;
	track->timebase_den = sample_rate;
	track->timescale = sample_rate;

	/* Set sample size (if fixed) */
	track->sample_size = get_sample_size(track);
}
static inline void add_chapter_track(struct mp4_mux *mux)
{
mux->chapter_track = bzalloc(sizeof(struct mp4_track));
mux->chapter_track->type = TRACK_CHAPTERS;
mux->chapter_track->codec = CODEC_TEXT;
mux->chapter_track->timescale = 1000;
mux->chapter_track->timebase_num = 1;
mux->chapter_track->timebase_den = 1000;
mux->chapter_track->track_id = ++mux->track_ctr;
}
static inline void free_packets(struct deque *dq)
{
size_t num = dq->size / sizeof(struct encoder_packet);
for (size_t i = 0; i < num; i++) {
struct encoder_packet pkt;
deque_pop_front(dq, &pkt, sizeof(struct encoder_packet));
obs_encoder_packet_release(&pkt);
}
}
/* Releases everything a track owns: its encoder reference, any packets still
 * queued, and all per-sample bookkeeping arrays. A NULL track is ignored.
 *
 * The mp4_track structure itself is not freed by this function. */
static inline void free_track(struct mp4_track *track)
{
	if (track) {
		obs_encoder_release(track->encoder);

		/* Release queued packets before dropping the queue storage. */
		free_packets(&track->packets);
		deque_free(&track->packets);

		da_free(track->sample_sizes);
		da_free(track->chunks);
		da_free(track->deltas);
		da_free(track->offsets);
		da_free(track->sync_samples);
		da_free(track->fragment_samples);
	}
}
/* ===========================================================================*/
/* API */
/* Creates a muxer for the given output.
 *
 * @param output      Output whose video/audio encoders become tracks
 * @param serializer  Serializer the muxer stores for writing
 * @param flags       Muxer behaviour flags
 * @return            Newly allocated muxer; free with mp4_mux_destroy()
 *
 * One track is added per encoder slot that is populated on the output,
 * video encoders first, then audio encoders. */
struct mp4_mux *mp4_mux_create(obs_output_t *output,
			       struct serializer *serializer,
			       enum mp4_mux_flags flags)
{
	struct mp4_mux *mux = bzalloc(sizeof(*mux));

	mux->flags = flags;
	mux->serializer = serializer;
	mux->output = output;

	/* Timestamp is based on 1904 rather than 1970. */
	mux->creation_time = time(NULL) + 0x7C25B080;

	/* Video tracks */
	for (size_t idx = 0; idx < MAX_OUTPUT_VIDEO_ENCODERS; idx++) {
		obs_encoder_t *venc =
			obs_output_get_video_encoder2(output, idx);
		if (venc)
			add_track(mux, venc);
	}

	/* Audio tracks */
	for (size_t idx = 0; idx < MAX_OUTPUT_AUDIO_ENCODERS; idx++) {
		obs_encoder_t *aenc =
			obs_output_get_audio_encoder(output, idx);
		if (aenc)
			add_track(mux, aenc);
	}

	return mux;
}
/* Destroys a muxer: frees every audio/video track, the chapter track (if one
 * was created), the track array and finally the muxer structure itself.
 *
 * Passing NULL is a no-op, matching the NULL tolerance of free_track() and
 * bfree(). The serializer and output stored in the muxer are not touched. */
void mp4_mux_destroy(struct mp4_mux *mux)
{
	if (!mux)
		return;

	for (size_t i = 0; i < mux->tracks.num; i++)
		free_track(&mux->tracks.array[i]);

	/* The chapter track is a separate heap allocation, so it needs both
	 * its contents and the structure itself freed. free_track() ignores
	 * NULL when no chapter was ever added. */
	free_track(mux->chapter_track);
	bfree(mux->chapter_track);

	da_free(mux->tracks);
	bfree(mux);
}
/* Submits an encoded packet to the muxer.
 *
 * @param mux  Muxer instance
 * @param pkt  Packet from one of the encoders the muxer has a track for
 * @return     false if no track belongs to the packet's encoder,
 *             true once the packet has been queued on its track
 *
 * Before queueing, the pending fragment is flushed if a fragmentation PTS is
 * set and every track has reached it. Video packets are run through the
 * codec-specific parser; a video keyframe with PTS > 0 becomes the next
 * fragmentation point. */
bool mp4_mux_submit_packet(struct mp4_mux *mux, struct encoder_packet *pkt)
{
	struct mp4_track *track = NULL;
	struct encoder_packet parsed_packet;
	enum obs_encoder_type type = pkt->type;
	bool fragment_ready = mux->next_frag_pts > 0;

	/* Single pass: locate the packet's track and check whether all tracks
	 * have caught up to the fragmentation PTS. */
	for (size_t i = 0; i < mux->tracks.num; i++) {
		struct mp4_track *tmp = &mux->tracks.array[i];

		fragment_ready = fragment_ready &&
				 tmp->last_pts_usec >= mux->next_frag_pts;

		if (tmp->encoder == pkt->encoder)
			track = tmp;
	}

	if (!track) {
		warn("Could not find track for packet of type %s with "
		     "track id %zu!",
		     type == OBS_ENCODER_VIDEO ? "video" : "audio",
		     pkt->track_idx);
		return false;
	}

	/* If all tracks have caught up to the keyframe we want to fragment on,
	 * flush the current fragment to disk. */
	if (fragment_ready)
		mp4_flush_fragment(mux);

	if (type == OBS_ENCODER_AUDIO) {
		obs_encoder_packet_ref(&parsed_packet, pkt);
	} else {
		switch (track->codec) {
		case CODEC_H264:
			obs_parse_avc_packet(&parsed_packet, pkt);
			break;
		case CODEC_HEVC:
			obs_parse_hevc_packet(&parsed_packet, pkt);
			break;
		case CODEC_AV1:
			obs_parse_av1_packet(&parsed_packet, pkt);
			break;
		default:
			/* No dedicated parser for this codec: take a plain
			 * reference so parsed_packet is always initialised
			 * before it is read and queued below. */
			obs_encoder_packet_ref(&parsed_packet, pkt);
			break;
		}

		/* Set fragmentation PTS if packet is keyframe and PTS > 0 */
		if (parsed_packet.keyframe && parsed_packet.pts > 0)
			mux->next_frag_pts = packet_pts_usec(&parsed_packet);
	}

	track_insert_packet(track, &parsed_packet);

	return true;
}
bool mp4_mux_add_chapter(struct mp4_mux *mux, int64_t dts_usec,
const char *name)
{
if (dts_usec < 0)
return false;
if (!mux->chapter_track)
add_chapter_track(mux);
/* To work correctly there needs to be a chapter at PTS 0,
* create that here if necessary. */
if (dts_usec > 0 && mux->chapter_track->packets.size == 0) {
mp4_mux_add_chapter(mux, 0,
obs_module_text("MP4Output.StartChapter"));
}
/* Create packets that will be muxed on final flush */
struct encoder_packet pkt;
mp4_create_chapter_pkt(&pkt, dts_usec, name);
track_insert_packet(mux->chapter_track, &pkt);
return true;
}
bool mp4_mux_finalise(struct mp4_mux *mux)
{
struct serializer *s = mux->serializer;
/* Flush remaining audio/video samples as final fragment. */
info("Flushing final fragment...");
/* Set target PTS to zero to indicate that we want to flush all
* the remaining packets */
mux->next_frag_pts = 0;
mp4_flush_fragment(mux);
info("Number of fragments: %u", mux->fragments_written);
if (mux->flags & MP4_SKIP_FINALISATION) {
warn("Skipping MP4 finalization!");
return true;
}
int64_t data_end = serializer_get_pos(s);
/* ---------------------------------------- */
/* Write full moov box */
/* Use array serializer for moov data as this will do a lot
* of seeks to write size values of variable-size boxes. */
struct serializer fs;
struct array_output_data ao;
array_output_serializer_init(&fs, &ao);
mux->serializer = &fs;
mp4_write_moov(mux, false);
s_write(s, ao.bytes.array, ao.bytes.num);
info("Full moov size: %zu KiB", ao.bytes.num / 1024);
mux->serializer = s; // restore real serializer
array_output_serializer_free(&ao);
/* ---------------------------------------- */
/* Overwrite file header (ftyp + free/moov) */
serializer_seek(s, 0, SERIALIZE_SEEK_START);
mp4_write_ftyp(mux, false);
size_t data_size = data_end - mux->placeholder_offset;
serializer_seek(s, (int64_t)mux->placeholder_offset,
SERIALIZE_SEEK_START);
/* If data is more than 4 GiB the mdat header becomes 16 bytes, hence
* why we create a 16-byte placeholder "free" box at the start. */
if (data_size > UINT32_MAX) {
s_wb32(s, 1); // 1 = use "largesize" field instead
s_write(s, "mdat", 4);
s_wb64(s, data_size); // largesize (64-bit)
} else {
s_wb32(s, (uint32_t)data_size);
s_write(s, "mdat", 4);
}
info("Final mdat size: %zu KiB", data_size / 1024);
return true;
}