obs-studio/libobs/media-io/format-conversion.c

325 lines
11 KiB
C
Raw Normal View History

2013-10-25 17:23:11 +00:00
/******************************************************************************
Copyright (C) 2013 by Hugh Bailey <obs.jim@gmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
2013-10-25 17:23:11 +00:00
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
#include "format-conversion.h"
#include <xmmintrin.h>
#include <emmintrin.h>
/* ...surprisingly, if I don't use a macro to force inlining, it causes the
* CPU usage to boost by a tremendous amount in debug builds. */
Revamp API and start using doxygen The API used to be designed in such a way to where it would expect exports for each individual source/output/encoder/etc. You would export functions for each and it would automatically load those functions based on a specific naming scheme from the module. The idea behind this was that I wanted to limit the usage of structures in the API so only functions could be used. It was an interesting idea in theory, but this idea turned out to be flawed in a number of ways: 1.) Requiring exports to create sources/outputs/encoders/etc meant that you could not create them by any other means, which meant that things like faruton's .net plugin would become difficult. 2.) Export function declarations could not be checked, therefore if you created a function with the wrong parameters and parameter types, the compiler wouldn't know how to check for that. 3.) Required overly complex load functions in libobs just to handle it. It makes much more sense to just have a load function that you call manually. Complexity is the bane of all good programs. 4.) It required that you have functions of specific names, which looked and felt somewhat unsightly. So, to fix these issues, I replaced it with a more commonly used API scheme, seen commonly in places like kernels and typical C libraries with abstraction. You simply create a structure that contains the callback definitions, and you pass it to a function to register that definition (such as obs_register_source), which you call in the obs_module_load of the module. It will also automatically check the structure size and ensure that it only loads the required values if the structure happened to add new values in an API change. The "main" source file for each module must include obs-module.h, and must use OBS_DECLARE_MODULE() within that source file. Also, started writing some doxygen documentation in to the main library headers. Will add more detailed documentation as I go.
2014-02-12 15:04:50 +00:00
#define get_m128_32_0(val) (*((uint32_t*)&val))
#define get_m128_32_1(val) (*(((uint32_t*)&val)+1))
2013-10-25 17:23:11 +00:00
#define pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2, mask, sh) \
do { \
__m128i pack_val = _mm_packs_epi32( \
_mm_srli_si128(_mm_and_si128(line1, mask), sh), \
_mm_srli_si128(_mm_and_si128(line2, mask), sh)); \
pack_val = _mm_packus_epi16(pack_val, pack_val); \
\
*(uint32_t*)(lum_plane+lum_pos0) = get_m128_32_0(pack_val); \
*(uint32_t*)(lum_plane+lum_pos1) = get_m128_32_1(pack_val); \
} while (false)
#define pack_val(lum_plane, lum_pos0, lum_pos1, line1, line2, mask) \
do { \
__m128i pack_val = _mm_packs_epi32( \
_mm_and_si128(line1, mask), \
_mm_and_si128(line2, mask)); \
pack_val = _mm_packus_epi16(pack_val, pack_val); \
\
*(uint32_t*)(lum_plane+lum_pos0) = get_m128_32_0(pack_val); \
*(uint32_t*)(lum_plane+lum_pos1) = get_m128_32_1(pack_val); \
} while (false)
#define pack_ch_1plane(uv_plane, chroma_pos, line1, line2, uv_mask) \
do { \
__m128i add_val = _mm_add_epi64( \
_mm_and_si128(line1, uv_mask), \
_mm_and_si128(line2, uv_mask)); \
__m128i avg_val = _mm_add_epi64( \
add_val, \
_mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1))); \
avg_val = _mm_srai_epi16(avg_val, 2); \
avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0)); \
avg_val = _mm_packus_epi16(avg_val, avg_val); \
\
*(uint32_t*)(uv_plane+chroma_pos) = get_m128_32_0(avg_val); \
} while (false)
#define pack_ch_2plane(u_plane, v_plane, chroma_pos, line1, line2, uv_mask) \
do { \
uint32_t packed_vals; \
\
__m128i add_val = _mm_add_epi64( \
_mm_and_si128(line1, uv_mask), \
_mm_and_si128(line2, uv_mask)); \
__m128i avg_val = _mm_add_epi64( \
add_val, \
_mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1))); \
avg_val = _mm_srai_epi16(avg_val, 2); \
avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0)); \
avg_val = _mm_shufflelo_epi16(avg_val, _MM_SHUFFLE(3, 1, 2, 0)); \
avg_val = _mm_packus_epi16(avg_val, avg_val); \
\
packed_vals = get_m128_32_0(avg_val); \
\
*(uint16_t*)(u_plane+chroma_pos) = (uint16_t)(packed_vals); \
*(uint16_t*)(v_plane+chroma_pos) = (uint16_t)(packed_vals>>16); \
} while (false)
static FORCE_INLINE uint32_t min_uint32(uint32_t a, uint32_t b)
2013-10-25 17:23:11 +00:00
{
return a < b ? a : b;
2013-10-25 17:23:11 +00:00
}
void compress_uyvx_to_i420(
const uint8_t *input, uint32_t in_linesize,
uint32_t start_y, uint32_t end_y,
uint8_t *output[], const uint32_t out_linesize[])
2013-10-25 17:23:11 +00:00
{
uint8_t *lum_plane = output[0];
uint8_t *u_plane = output[1];
uint8_t *v_plane = output[2];
uint32_t width = min_uint32(in_linesize, out_linesize[0]);
2013-10-25 17:23:11 +00:00
uint32_t y;
__m128i lum_mask = _mm_set1_epi32(0x0000FF00);
__m128i uv_mask = _mm_set1_epi16(0x00FF);
for (y = start_y; y < end_y; y += 2) {
uint32_t y_pos = y * in_linesize;
uint32_t chroma_y_pos = (y>>1) * out_linesize[1];
uint32_t lum_y_pos = y * out_linesize[0];
2013-10-25 17:23:11 +00:00
uint32_t x;
for (x = 0; x < width; x += 4) {
const uint8_t *img = input + y_pos + x*4;
uint32_t lum_pos0 = lum_y_pos + x;
uint32_t lum_pos1 = lum_pos0 + out_linesize[0];
2013-10-25 17:23:11 +00:00
__m128i line1 = _mm_load_si128((const __m128i*)img);
__m128i line2 = _mm_load_si128(
(const __m128i*)(img + in_linesize));
2013-10-25 17:23:11 +00:00
pack_shift(lum_plane, lum_pos0, lum_pos1,
line1, line2, lum_mask, 1);
pack_ch_2plane(u_plane, v_plane,
2013-10-25 17:23:11 +00:00
chroma_y_pos + (x>>1),
line1, line2, uv_mask);
}
}
}
void compress_uyvx_to_nv12(
const uint8_t *input, uint32_t in_linesize,
uint32_t start_y, uint32_t end_y,
uint8_t *output[], const uint32_t out_linesize[])
2013-10-25 17:23:11 +00:00
{
uint8_t *lum_plane = output[0];
uint8_t *chroma_plane = output[1];
uint32_t width = min_uint32(in_linesize, out_linesize[0]);
2013-10-25 17:23:11 +00:00
uint32_t y;
__m128i lum_mask = _mm_set1_epi32(0x0000FF00);
__m128i uv_mask = _mm_set1_epi16(0x00FF);
for (y = start_y; y < end_y; y += 2) {
uint32_t y_pos = y * in_linesize;
uint32_t chroma_y_pos = (y>>1) * out_linesize[1];
uint32_t lum_y_pos = y * out_linesize[0];
2013-10-25 17:23:11 +00:00
uint32_t x;
for (x = 0; x < width; x += 4) {
const uint8_t *img = input + y_pos + x*4;
uint32_t lum_pos0 = lum_y_pos + x;
uint32_t lum_pos1 = lum_pos0 + out_linesize[0];
2013-10-25 17:23:11 +00:00
__m128i line1 = _mm_load_si128((const __m128i*)img);
__m128i line2 = _mm_load_si128(
(const __m128i*)(img + in_linesize));
2013-10-25 17:23:11 +00:00
pack_shift(lum_plane, lum_pos0, lum_pos1,
line1, line2, lum_mask, 1);
pack_ch_1plane(chroma_plane, chroma_y_pos + x,
2013-10-25 17:23:11 +00:00
line1, line2, uv_mask);
}
}
}
void convert_uyvx_to_i444(
const uint8_t *input, uint32_t in_linesize,
uint32_t start_y, uint32_t end_y,
uint8_t *output[], const uint32_t out_linesize[])
{
uint8_t *lum_plane = output[0];
uint8_t *u_plane = output[1];
uint8_t *v_plane = output[2];
uint32_t width = min_uint32(in_linesize, out_linesize[0]);
uint32_t y;
__m128i lum_mask = _mm_set1_epi32(0x0000FF00);
__m128i u_mask = _mm_set1_epi32(0x000000FF);
__m128i v_mask = _mm_set1_epi32(0x00FF0000);
for (y = start_y; y < end_y; y += 2) {
uint32_t y_pos = y * in_linesize;
uint32_t lum_y_pos = y * out_linesize[0];
uint32_t x;
for (x = 0; x < width; x += 4) {
const uint8_t *img = input + y_pos + x*4;
uint32_t lum_pos0 = lum_y_pos + x;
uint32_t lum_pos1 = lum_pos0 + out_linesize[0];
__m128i line1 = _mm_load_si128((const __m128i*)img);
__m128i line2 = _mm_load_si128(
(const __m128i*)(img + in_linesize));
pack_shift(lum_plane, lum_pos0, lum_pos1,
line1, line2, lum_mask, 1);
pack_val(u_plane, lum_pos0, lum_pos1,
line1, line2, u_mask);
pack_shift(v_plane, lum_pos0, lum_pos1,
line1, line2, v_mask, 2);
}
}
}
void decompress_420(
const uint8_t *const input[], const uint32_t in_linesize[],
uint32_t start_y, uint32_t end_y,
uint8_t *output, uint32_t out_linesize)
2013-10-25 17:23:11 +00:00
{
uint32_t start_y_d2 = start_y/2;
uint32_t width_d2 = in_linesize[0]/2;
2013-10-25 17:23:11 +00:00
uint32_t height_d2 = end_y/2;
uint32_t y;
for (y = start_y_d2; y < height_d2; y++) {
const uint8_t *chroma0 = input[1] + y * in_linesize[1];
const uint8_t *chroma1 = input[2] + y * in_linesize[2];
2013-10-25 17:23:11 +00:00
register const uint8_t *lum0, *lum1;
register uint32_t *output0, *output1;
uint32_t x;
lum0 = input[0] + y * 2 * in_linesize[0];
lum1 = lum0 + in_linesize[0];
output0 = (uint32_t*)(output + y * 2 * out_linesize);
output1 = (uint32_t*)((uint8_t*)output0 + out_linesize);
2013-10-25 17:23:11 +00:00
for (x = 0; x < width_d2; x++) {
uint32_t out;
out = (*(chroma0++) << 8) | *(chroma1++);
2013-10-25 17:23:11 +00:00
*(output0++) = (*(lum0++) << 16) | out;
*(output0++) = (*(lum0++) << 16) | out;
2013-10-25 17:23:11 +00:00
*(output1++) = (*(lum1++) << 16) | out;
*(output1++) = (*(lum1++) << 16) | out;
2013-10-25 17:23:11 +00:00
}
}
}
void decompress_nv12(
const uint8_t *const input[], const uint32_t in_linesize[],
uint32_t start_y, uint32_t end_y,
uint8_t *output, uint32_t out_linesize)
{
uint32_t start_y_d2 = start_y/2;
uint32_t width_d2 = min_uint32(in_linesize[0], out_linesize)/2;
uint32_t height_d2 = end_y/2;
uint32_t y;
for (y = start_y_d2; y < height_d2; y++) {
const uint16_t *chroma;
register const uint8_t *lum0, *lum1;
register uint32_t *output0, *output1;
uint32_t x;
chroma = (const uint16_t*)(input[1] + y * in_linesize[1]);
lum0 = input[0] + y * 2 * in_linesize[0];
lum1 = lum0 + in_linesize[0];
output0 = (uint32_t*)(output + y * 2 * out_linesize);
output1 = (uint32_t*)((uint8_t*)output0 + out_linesize);
for (x = 0; x < width_d2; x++) {
uint32_t out = *(chroma++) << 8;
*(output0++) = *(lum0++) | out;
*(output0++) = *(lum0++) | out;
*(output1++) = *(lum1++) | out;
*(output1++) = *(lum1++) | out;
}
}
}
void decompress_422(
const uint8_t *input, uint32_t in_linesize,
uint32_t start_y, uint32_t end_y,
uint8_t *output, uint32_t out_linesize,
bool leading_lum)
2013-10-25 17:23:11 +00:00
{
uint32_t width_d2 = min_uint32(in_linesize, out_linesize)/2;
2013-10-25 17:23:11 +00:00
uint32_t y;
register const uint32_t *input32;
register const uint32_t *input32_end;
register uint32_t *output32;
if (leading_lum) {
for (y = start_y; y < end_y; y++) {
input32 = (const uint32_t*)(input + y*in_linesize);
2013-10-25 17:23:11 +00:00
input32_end = input32 + width_d2;
output32 = (uint32_t*)(output + y*out_linesize);
2013-10-25 17:23:11 +00:00
while(input32 < input32_end) {
register uint32_t dw = *input32;
output32[0] = dw;
dw &= 0xFFFFFF00;
dw |= (uint8_t)(dw>>16);
output32[1] = dw;
output32 += 2;
input32++;
}
}
} else {
for (y = start_y; y < end_y; y++) {
input32 = (const uint32_t*)(input + y*in_linesize);
2013-10-25 17:23:11 +00:00
input32_end = input32 + width_d2;
output32 = (uint32_t*)(output + y*out_linesize);
2013-10-25 17:23:11 +00:00
while (input32 < input32_end) {
register uint32_t dw = *input32;
output32[0] = dw;
dw &= 0xFFFF00FF;
dw |= (dw>>16) & 0xFF00;
output32[1] = dw;
output32 += 2;
input32++;
}
}
}
}