File: bundled_shaders.cpp

// Autogenerated by make_bundled_shaders.cpp. Do not edit by hand!
#include <string>
#include "bundled_shaders.h"

namespace movit {

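// Each entry maps one shader filename to a byte (offset, length) slice of
// the concatenated shader_bundle string defined below; the offsets are
// contiguous, and a null filename terminates the table.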
BundledShader bundled_shaders[] = {
	{ "vs.vert", 0, 575 },
	{ "vs.130.vert", 575, 571 },
	{ "vs.150.vert", 1146, 571 },
	{ "vs.300es.vert", 1717, 598 },
	{ "header.130.frag", 2315, 94 },
	{ "header.150.frag", 2409, 94 },
	{ "header.300es.frag", 2503, 121 },
	{ "header.comp", 2624, 617 },
	{ "footer.frag", 3241, 1952 },
	{ "identity.frag", 5193, 107 },
	{ "footer.comp", 5300, 1050 },
	{ "texture1d.130.frag", 6350, 154 },
	{ "texture1d.150.frag", 6504, 154 },
	{ "texture1d.300es.frag", 6658, 181 },
	{ "flat_input.frag", 6839, 536 },
	{ "ycbcr_input.frag", 7375, 1262 },
	{ "ycbcr_422interleaved_input.frag", 8637, 813 },
	{ "lift_gamma_gain_effect.frag", 9450, 632 },
	{ "white_balance_effect.frag", 10082, 177 },
	{ "gamma_expansion_effect.frag", 10259, 542 },
	{ "gamma_compression_effect.frag", 10801, 738 },
	{ "colorspace_conversion_effect.frag", 11539, 246 },
	{ "alpha_multiplication_effect.frag", 11785, 77 },
	{ "alpha_division_effect.frag", 11862, 262 },
	{ "saturation_effect.frag", 12124, 223 },
	{ "blur_effect.frag", 12347, 661 },
	{ "mix_effect.frag", 13008, 1772 },
	{ "overlay_effect.frag", 14780, 929 },
	{ "padding_effect.frag", 15709, 843 },
	{ "resample_effect.frag", 16552, 2429 },
	{ "dither_effect.frag", 18981, 1561 },
	{ "deconvolution_sharpen_effect.frag", 20542, 1457 },
	{ "fft_pass_effect.frag", 21999, 952 },
	{ "vignette_effect.frag", 22951, 627 },
	{ "slice_effect.frag", 23578, 1078 },
	{ "complex_modulate_effect.frag", 24656, 331 },
	{ "luma_mix_effect.frag", 24987, 1795 },
	{ "ycbcr_conversion_effect.frag", 26782, 932 },
	{ "deinterlace_effect.frag", 27714, 8274 },
	{ "sandbox_effect.frag", 35988, 148 },
	{ "mirror_effect.frag", 36136, 101 },
	{ "multiply_effect.frag", 36237, 63 },
	{ "deinterlace_effect.comp", 36300, 8645 },
	{ "highlight_cutoff_effect.frag", 44945, 79 },
	{ "overlay_matte_effect.frag", 45024, 235 },
	{ nullptr, 0, 0 }
};
const char *shader_bundle = "attribute vec2 position;\nattribute vec2 texcoord;\nvarying vec2 tc;\n\n// Will be overridden by compile_glsl_program() if needed.\n// (It cannot just be prepended, as #version must be before everything.)\n#define FLIP_ORIGIN 0\n\nvoid main()\n{\n\t// The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:\n\t//\n\t//   2.000  0.000  0.000 -1.000\n\t//   0.000  2.000  0.000 -1.000\n\t//   0.000  0.000 -2.000 -1.000\n\t//   0.000  0.000  0.000  1.000\n\tgl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);\n\ttc = texcoord;\n#if FLIP_ORIGIN\n\ttc.y = 1.0f - tc.y;\n#endif\n}\n#version 130\n\nin vec2 position;\nin vec2 texcoord;\nout vec2 tc;\n\n// Will be overridden by compile_glsl_program() if needed.\n// (It cannot just be prepended, as #version must be before everything.)\n#define FLIP_ORIGIN 0\n\nvoid main()\n{\n\t// The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:\n\t//\n\t//   2.000  0.000  0.000 -1.000\n\t//   0.000  2.000  0.000 -1.000\n\t//   0.000  0.000 -2.000 -1.000\n\t//   0.000  0.000  0.000  1.000\n\tgl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);\n\ttc = texcoord;\n#if FLIP_ORIGIN\n\ttc.y = 1.0f - tc.y;\n#endif\n}\n#version 150\n\nin vec2 position;\nin vec2 texcoord;\nout vec2 tc;\n\n// Will be overridden by compile_glsl_program() if needed.\n// (It cannot just be prepended, as #version must be before everything.)\n#define FLIP_ORIGIN 0\n\nvoid main()\n{\n\t// The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:\n\t//\n\t//   2.000  0.000  0.000 -1.000\n\t//   0.000  2.000  0.000 -1.000\n\t//   0.000  0.000 -2.000 -1.000\n\t//   0.000  0.000  0.000  1.000\n\tgl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);\n\ttc = texcoord;\n#if FLIP_ORIGIN\n\ttc.y = 1.0f - tc.y;\n#endif\n}\n#version 300 es\n\nprecision highp float;\n\nin vec2 position;\nin vec2 texcoord;\nout vec2 tc;\n\n// Will be overridden by compile_glsl_program() if needed.\n// (It cannot just be prepended, as #version must be before everything.)\n#define FLIP_ORIGIN 0\n\nvoid main()\n{\n\t// The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:\n\t//\n\t//   2.000  0.000  0.000 -1.000\n\t//   0.000  2.000  0.000 -1.000\n\t//   0.000  0.000 -2.000 -1.000\n\t//   0.000  0.000  0.000  1.000\n\tgl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);\n\ttc = texcoord;\n#if FLIP_ORIGIN\n\ttc.y = 1.0f - tc.y;\n#endif\n}\n#version 130\n\nin vec2 tc;\n\nvec4 tex2D(sampler2D s, vec2 coord)\n{\n\treturn texture(s, coord);\n}\n#version 150\n\nin vec2 tc;\n\nvec4 tex2D(sampler2D s, vec2 coord)\n{\n\treturn texture(s, coord);\n}\n#version 300 es\n\nprecision highp float;\n\nin vec2 tc;\n\nvec4 tex2D(sampler2D s, vec2 coord)\n{\n\treturn texture(s, coord);\n}\n#version 150\n#extension GL_ARB_compute_shader : enable\n#extension GL_ARB_shader_image_load_store : enable\n#extension GL_ARB_shader_image_size : enable\n\n// The texture the compute shader is writing to.\nuniform restrict writeonly image2D tex_outbuf;\n\n// Defined in footer.comp.\nvec4 tex2D(sampler2D s, vec2 coord);\nvoid cs_output(uvec2 coord, vec4 val);\nvoid cs_output(ivec2 coord, vec4 val);\n\n// Used if there are any steps used to postprocess compute shader output.\n// Initialized due to https://bugs.freedesktop.org/show_bug.cgi?id=103895.\nvec4 CS_OUTPUT_VAL = vec4(0.0);\n\n#define OUTPUT(tc, val) cs_output(tc, val)\n// GLSL is pickier than the C++ preprocessor in if-testing for undefined\n// tokens; do some fixups here 
to keep it happy.\n\n#ifndef YCBCR_OUTPUT_PLANAR\n#define YCBCR_OUTPUT_PLANAR 0\n#endif\n\n#ifndef YCBCR_OUTPUT_SPLIT_Y_AND_CBCR\n#define YCBCR_OUTPUT_SPLIT_Y_AND_CBCR 0\n#endif\n\n#ifndef SECOND_YCBCR_OUTPUT_PLANAR\n#define SECOND_YCBCR_OUTPUT_PLANAR 0\n#endif\n\n#ifndef SECOND_YCBCR_OUTPUT_SPLIT_Y_AND_CBCR\n#define SECOND_YCBCR_OUTPUT_SPLIT_Y_AND_CBCR 0\n#endif\n\n#ifndef SECOND_YCBCR_OUTPUT_INTERLEAVED\n#define SECOND_YCBCR_OUTPUT_INTERLEAVED 0\n#endif\n\n#ifndef YCBCR_ALSO_OUTPUT_RGBA\n#define YCBCR_ALSO_OUTPUT_RGBA 0\n#endif\n\n#ifndef SQUARE_ROOT_TRANSFORMATION\n#define SQUARE_ROOT_TRANSFORMATION 0\n#endif\n\n#if YCBCR_OUTPUT_PLANAR\nout vec4 Y, Cb, Cr;\n#elif YCBCR_OUTPUT_SPLIT_Y_AND_CBCR\nout vec4 Y, Chroma;\n#else\nout vec4 FragColor;  // Y'CbCr or RGBA.\n#endif\n\n#if SECOND_YCBCR_OUTPUT_PLANAR\nout vec4 Y2, Cb2, Cr2;\n#elif SECOND_YCBCR_OUTPUT_SPLIT_Y_AND_CBCR\nout vec4 Y2, Chroma2;\n#elif SECOND_YCBCR_OUTPUT_INTERLEAVED\nout vec4 YCbCr2;\n#endif\n\n#if YCBCR_ALSO_OUTPUT_RGBA\nout vec4 RGBA;\n#endif\n\nvoid main()\n{\n#if YCBCR_ALSO_OUTPUT_RGBA\n\tvec4 color[2] = INPUT(tc);\n\tvec4 color0 = color[0];\n\tvec4 color1 = color[1];\n#else\n\tvec4 color0 = INPUT(tc);\n#endif\n\n#if SQUARE_ROOT_TRANSFORMATION\n\t// Make sure we don't give negative values to sqrt.\n\tcolor0.rgb = sqrt(max(color0.rgb, 0.0));\n#endif\n\n#if YCBCR_OUTPUT_PLANAR\n\tY = color0.rrra;\n\tCb = color0.ggga;\n\tCr = color0.bbba;\n#elif YCBCR_OUTPUT_SPLIT_Y_AND_CBCR\n\tY = color0.rrra;\n\tChroma = color0.gbba;\n#else\n\tFragColor = color0;\n#endif\n\n\t// Exactly the same, just with other outputs.\n\t// (GLSL does not allow arrays of outputs.)\n#if SECOND_YCBCR_OUTPUT_PLANAR\n\tY2 = color0.rrra;\n\tCb2 = color0.ggga;\n\tCr2 = color0.bbba;\n#elif SECOND_YCBCR_OUTPUT_SPLIT_Y_AND_CBCR\n\tY2 = color0.rrra;\n\tChroma2 = color0.gbba;\n#elif SECOND_YCBCR_OUTPUT_INTERLEAVED\n\tYCbCr2 = color0;\n#endif\n\n#if YCBCR_ALSO_OUTPUT_RGBA\n\tRGBA = color1;\n#endif\n}\n// Identity transformation (sometimes useful to do nothing).\nvec4 FUNCNAME(vec2 tc)\n{\n\treturn INPUT(tc);\n}\n// GLSL is pickier than the C++ preprocessor in if-testing for undefined\n// tokens; do some fixups here to keep it happy.\n\n#ifndef SQUARE_ROOT_TRANSFORMATION\n#define SQUARE_ROOT_TRANSFORMATION 0\n#endif\n\n#ifndef FLIP_ORIGIN\n#define FLIP_ORIGIN 0\n#endif\n\nvoid main()\n{\n\tINPUT();\n}\n\nvec4 tex2D(sampler2D s, vec2 coord)\n{\n\treturn texture(s, coord);\n}\n\nvoid cs_output(uvec2 coord, vec4 val)\n{\n\tcs_output(ivec2(coord), val);\n}\n\nvoid cs_output(ivec2 coord, vec4 val)\n{\n\t// Run the value through any postprocessing steps we might have.\n\t// Note that we need to give in the actual coordinates, since the\n\t// effect could have multiple (non-compute) inputs, and would also\n\t// be allowed to make effects based on the texture coordinate alone.\n\tCS_OUTPUT_VAL = val;\n\tval = CS_POSTPROC(NORMALIZE_TEXTURE_COORDS(coord));\n\n#if SQUARE_ROOT_TRANSFORMATION\n\t// Make sure we don't give negative values to sqrt.\n\tval.rgb = sqrt(max(val.rgb, 0.0));\n#endif\n\n#if FLIP_ORIGIN\n\tcoord.y = imageSize(tex_outbuf).y - coord.y - 1;\n#endif\n\n\timageStore(tex_outbuf, coord, val);\n}\n#version 130\n\nuniform sampler2D tex;\nin vec2 tc;\n\nout vec4 FragColor;\n\nvoid main()\n{\n\tFragColor = texture(tex, tc);  // Second component is irrelevant.\n}\n#version 150\n\nuniform sampler2D tex;\nin vec2 tc;\n\nout vec4 FragColor;\n\nvoid main()\n{\n\tFragColor = texture(tex, tc);  // Second component is irrelevant.\n}\n#version 300 es\n\nprecision highp 
float;\n\nuniform sampler2D tex;\nin vec2 tc;\n\nout vec4 FragColor;\n\nvoid main()\n{\n\tFragColor = texture(tex, tc);  // Second component is irrelevant.\n}\n// Implicit uniforms:\n// uniform sampler2D PREFIX(tex);\n\nvec4 FUNCNAME(vec2 tc) {\n\t// OpenGL's origin is bottom-left, but most graphics software assumes\n\t// a top-left origin. Thus, for inputs that come from the user,\n\t// we flip the y coordinate.\n\ttc.y = 1.0 - tc.y;\n\n\tvec4 pixel = tex2D(PREFIX(tex), tc);\n\n\t// These two are #defined to 0 or 1 in flat_input.cpp.\n#if FIXUP_SWAP_RB\n\tpixel.rb = pixel.br;\n#endif\n#if FIXUP_RED_TO_GRAYSCALE\n\tpixel.gb = pixel.rr;\n#endif\n\treturn pixel;\n}\n\n#undef FIXUP_SWAP_RB\n#undef FIXUP_RED_TO_GRAYSCALE\n// Implicit uniforms:\n// uniform sampler2D PREFIX(tex_y);\n// uniform sampler2D PREFIX(tex_cbcr);  // If CB_CR_SAME_TEXTURE.\n// uniform sampler2D PREFIX(tex_cb);    // If not CB_CR_SAME_TEXTURE.\n// uniform sampler2D PREFIX(tex_cr);    // If not CB_CR_SAME_TEXTURE.\n// uniform mat3 PREFIX(ycbcr_matrix);\n// uniform vec3 PREFIX(offset);\n// uniform vec2 PREFIX(cb_offset);\n// uniform vec2 PREFIX(cr_offset);\n\nvec4 FUNCNAME(vec2 tc) {\n\t// OpenGL's origin is bottom-left, but most graphics software assumes\n\t// a top-left origin. Thus, for inputs that come from the user,\n\t// we flip the y coordinate.\n\ttc.y = 1.0 - tc.y;\n\n\tvec3 ycbcr;\n#if Y_CB_CR_SAME_TEXTURE\n\tycbcr = tex2D(PREFIX(tex_y), tc).xyz;\n#else\n\tycbcr.x = tex2D(PREFIX(tex_y), tc).x;\n  #if CB_CR_SAME_TEXTURE\n    #if CB_CR_OFFSETS_EQUAL\n\tycbcr.yz = tex2D(PREFIX(tex_cbcr), tc + PREFIX(cb_offset)).xy;\n    #else\n\tycbcr.y = tex2D(PREFIX(tex_cbcr), tc + PREFIX(cb_offset)).x;\n\tycbcr.z = tex2D(PREFIX(tex_cbcr), tc + PREFIX(cr_offset)).x;\n    #endif\n  #else\n\tycbcr.y = tex2D(PREFIX(tex_cb), tc + PREFIX(cb_offset)).x;\n\tycbcr.z = tex2D(PREFIX(tex_cr), tc + PREFIX(cr_offset)).x;\n  #endif\n#endif\n\n\tycbcr -= PREFIX(offset);\n\n\tvec4 rgba;\n\trgba.rgb = PREFIX(inv_ycbcr_matrix) * ycbcr;\n\trgba.a = 1.0;\n\treturn rgba;\n}\n// Implicit uniforms:\n// uniform sampler2D PREFIX(tex_y);\n// uniform sampler2D PREFIX(tex_cbcr);\n\nvec4 FUNCNAME(vec2 tc) {\n\t// OpenGL's origin is bottom-left, but most graphics software assumes\n\t// a top-left origin. 
Thus, for inputs that come from the user,\n\t// we flip the y coordinate.\n\ttc.y = 1.0 - tc.y;\n\n\tvec3 ycbcr;\n\tycbcr.x = tex2D(PREFIX(tex_y), tc).y;\n#if CB_CR_OFFSETS_EQUAL\n\tvec2 tc_cbcr = tc;\n\ttc_cbcr.x += PREFIX(cb_offset_x);\n\tycbcr.yz = tex2D(PREFIX(tex_cbcr), tc_cbcr).xz;\n#else\n\tvec2 tc_cb = tc;\n\ttc_cb.x += PREFIX(cb_offset_x);\n\tycbcr.y = tex2D(PREFIX(tex_cbcr), tc_cb).x;\n\n\tvec2 tc_cr = tc;\n\ttc_cr.x += PREFIX(cr_offset_x);\n\tycbcr.z = tex2D(PREFIX(tex_cbcr), tc_cr).z;\n#endif\n\n\tycbcr -= PREFIX(offset);\n\n\tvec4 rgba;\n\trgba.rgb = PREFIX(inv_ycbcr_matrix) * ycbcr;\n\trgba.a = 1.0;\n\treturn rgba;\n}\n// Implicit uniforms:\n//\n// These are calculated in the host code to save some arithmetic.\n// uniform vec3 PREFIX(gain_pow_inv_gamma);  // gain^(1/gamma).\n// uniform vec3 PREFIX(inv_gamma_22);  // 2.2 / gamma.\n\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 x = INPUT(tc);\n\n\tx.rgb /= x.aaa;\n\n\t// pow() of negative numbers is undefined, so clip out-of-gamut values.\n\tx.rgb = max(x.rgb, 0.0);\n\n\tx.rgb = pow(x.rgb, vec3(1.0/2.2));\n\tx.rgb += PREFIX(lift) * (vec3(1) - x.rgb);\n\n\t// Clip out-of-gamut values again.\n\tx.rgb = max(x.rgb, 0.0);\n\n\tx.rgb = pow(x.rgb, PREFIX(inv_gamma_22));\n\tx.rgb *= PREFIX(gain_pow_inv_gamma);\n\tx.rgb *= x.aaa;\n\n\treturn x;\n}\n// Implicit uniforms:\n// uniform mat3 PREFIX(correction_matrix);\n\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 ret = INPUT(tc);\n\tret.rgb = PREFIX(correction_matrix) * ret.rgb;\n\treturn ret;\n}\n// Expand gamma curve.\n\n// Implicit uniforms:\n// uniform float PREFIX(linear_scale);\n// uniform float PREFIX(c)[5];\n// uniform float PREFIX(beta);\n\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 x = INPUT(tc);\n\n\tvec3 a = x.rgb * PREFIX(linear_scale);\n\n\t// Fourth-order polynomial approximation to pow(). See the .cpp file for details.\n\tvec3 b = PREFIX(c[0]) + (PREFIX(c[1]) + (PREFIX(c[2]) + (PREFIX(c[3]) + PREFIX(c[4]) * x.rgb) * x.rgb) * x.rgb) * x.rgb;\n\n\tvec3 f = vec3(greaterThan(x.rgb, vec3(PREFIX(beta))));\n\tx = vec4(mix(a, b, f), x.a);\n\n\treturn x;\n}\n// Compress gamma curve.\n\n// Implicit uniforms:\n// uniform float PREFIX(linear_scale);\n// uniform float PREFIX(c)[5];\n// uniform float PREFIX(beta);\n\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 x = INPUT(tc);\n\n\t// We could reasonably get values outside (0.0, 1.0), but the formulas below\n\t// are not valid outside that range, so clamp before we do anything else.\n\tx.rgb = clamp(x.rgb, 0.0, 1.0);\n\n\tvec3 a = x.rgb * PREFIX(linear_scale);\n\n\t// Fourth-order polynomial approximation to pow(). See the .cpp file for details.\n\tvec3 s = sqrt(x.rgb);\n\tvec3 b = PREFIX(c)[0] + (PREFIX(c)[1] + (PREFIX(c)[2] + (PREFIX(c)[3] + PREFIX(c)[4] * s) * s) * s) * s;\n\n\tvec3 f = vec3(greaterThan(x.rgb, vec3(PREFIX(beta))));\n\tx = vec4(mix(a, b, f), x.a);\n\n\treturn x;\n}\n// Colorspace conversion (needs to be done in linear space).\n// The matrix is computed on the host and baked into the shader at compile time.\n\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 x = INPUT(tc);\n\tx.rgb = PREFIX(conversion_matrix) * x.rgb;\n\treturn x;\n}\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 x = INPUT(tc);\n\tx.rgb *= x.aaa;\t\n\treturn x;\n}\n// Note: Division by zero will give inf or nan, whose conversion to\n// integer types is implementation-defined. 
However, anything is fine for\n// alpha=0, since that's undefined anyway.\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 x = INPUT(tc);\n\tx.rgb /= x.aaa;\t\n\treturn x;\n}\n// Saturate/desaturate (in linear space).\n\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 x = INPUT(tc);\n\n\tfloat luminance = dot(x.rgb, vec3(0.2126, 0.7152, 0.0722));\n\tx.rgb = mix(vec3(luminance), x.rgb, PREFIX(saturation));\n\n\treturn x;\n}\n// A simple un.directional blur.\n// DIRECTION_VERTICAL will be #defined to 1 if we are doing a vertical blur,\n// 0 otherwise.\n\n// Implicit uniforms:\n// uniform vec2 PREFIX(samples)[NUM_TAPS / 2 + 1];\n\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 sum = vec4(PREFIX(samples)[0].y) * INPUT(tc);\n\tfor (int i = 1; i < NUM_TAPS / 2 + 1; ++i) {\n\t\tvec2 sample = PREFIX(samples)[i];\n\t\tvec2 sample1_tc = tc, sample2_tc = tc;\n#if DIRECTION_VERTICAL\n\t\tsample1_tc.y -= sample.x;\n\t\tsample2_tc.y += sample.x;\n#else\n\t\tsample1_tc.x -= sample.x;\n\t\tsample2_tc.x += sample.x;\n#endif\n\t\tsum += vec4(sample.y) * (INPUT(sample1_tc) + INPUT(sample2_tc));\n\t}\n\treturn sum;\n}\n\n#undef DIRECTION_VERTICAL\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 first = INPUT1(tc);\n\tvec4 second = INPUT2(tc);\n\tvec4 result = vec4(PREFIX(strength_first)) * first + vec4(PREFIX(strength_second)) * second;\n\n\t// Clamping alpha at some stage, either here or in AlphaDivisionEffect,\n\t// is actually very important for some use cases. Consider, for instance,\n\t// the case where we have additive blending (strength_first = strength_second = 1),\n\t// and add two 50% gray 100% opaque (0.5, 0.5, 0.5, 1.0) pixels. Without\n\t// alpha clamping, we'd get (1.0, 1.0, 1.0, 2.0), which would then in\n\t// conversion to postmultiplied be divided back to (0.5, 0.5, 0.5)!\n\t// Clamping alpha to 1.0 fixes the problem, and we get the expected result\n\t// of (1.0, 1.0, 1.0). Similarly, adding (0.5, 0.5, 0.5, 0.5) to itself\n\t// yields (1.0, 1.0, 1.0, 1.0) (100% white 100% opaque), which makes sense.\n\t//\n\t// The classic way of doing additive blending with premultiplied alpha\n\t// is to give the additive component alpha=0, but this also doesn't make\n\t// sense in a world where we could end up postmultiplied; just consider\n\t// the case where we have first=(0, 0, 0, 0) (ie., completely transparent)\n\t// and second=(0.5, 0.5, 0.5, 0.5) (ie., white at 50% opacity).\n\t// Zeroing out the alpha of second would yield (0.5, 0.5, 0.5, 0.0),\n\t// which has undefined RGB values in postmultiplied storage; certainly\n\t// e.g. (0, 0, 0, 0) would not be an expected output. Also, it would\n\t// break the expectation that A+B = B+A.\n\t//\n\t// Note that we do _not_ clamp RGB, since it might be useful to have\n\t// out-of-gamut colors. We could choose to do the alpha clamping in\n\t// AlphaDivisionEffect instead, though; I haven't thought a lot about\n\t// if that would be better or not.\n\tresult.a = clamp(result.a, 0.0, 1.0);\n\n\treturn result;\n}\n// It's actually (but surprisingly) not correct to do a mix() here;\n// it would be if we had postmultiplied alpha and didn't have to worry\n// about alpha in the bottom layer, but given that we use premultiplied\n// alpha all over, top shouldn't actually be multiplied by anything.\n//\n// These formulas come from Wikipedia:\n//\n//   http://en.wikipedia.org/wiki/Alpha_compositing\n//\n// We use the associative version given. 
However, note that since we want\n// _output_ to be premultiplied, C_o from Wikipedia is not what we want,\n// but rather c_o (which is not explicitly given, but obviously is just\n// C_o without the division by alpha_o).\n\nvec4 FUNCNAME(vec2 tc) {\n// SWAP_INPUTS will be #defined to 1 if we want to swap the two inputs,\n#if SWAP_INPUTS\n\tvec4 bottom = INPUT2(tc);\n\tvec4 top = INPUT1(tc);\n#else\n\tvec4 bottom = INPUT1(tc);\n\tvec4 top = INPUT2(tc);\n#endif\n\treturn top + (1.0 - top.a) * bottom;\n}\n\n#undef SWAP_INPUTS\n// Implicit uniforms:\n// uniform vec2 PREFIX(offset);\n// uniform vec2 PREFIX(scale);\n//\n// uniform vec2 PREFIX(normalized_coords_to_texels);\n// uniform vec2 PREFIX(offset_bottomleft);\n// uniform vec2 PREFIX(offset_topright);\n\nvec4 FUNCNAME(vec2 tc) {\n\ttc -= PREFIX(offset);\n\ttc *= PREFIX(scale);\n\n\tvec2 tc_texels = tc * PREFIX(normalized_coords_to_texels);\n\tvec2 coverage_bottomleft = clamp(tc_texels + PREFIX(offset_bottomleft), 0.0f, 1.0f);\n\tvec2 coverage_topright = clamp(PREFIX(offset_topright) - tc_texels, 0.0f, 1.0f);\n\tvec2 coverage_both = coverage_bottomleft * coverage_topright;\n\tfloat coverage = coverage_both.x * coverage_both.y;\n\n\tif (coverage <= 0.0f) {\n\t\t// Short-circuit in case the underlying function is expensive to call.\n\t\treturn PREFIX(border_color);\n\t} else {\n\t\treturn mix(PREFIX(border_color), INPUT(tc), coverage);\n\t}\n}\n// DIRECTION_VERTICAL will be #defined to 1 if we are scaling vertically,\n// and 0 otherwise.\n\n// Implicit uniforms:\n// uniform sampler2D PREFIX(sample_tex);\n// uniform int PREFIX(num_samples);\n// uniform float PREFIX(num_loops);\n// uniform float PREFIX(sample_x_scale);\n// uniform float PREFIX(sample_x_offset);\n// uniform float PREFIX(slice_height);\n\n// We put the fractional part of the offset (-0.5 to 0.5 pixels) in the weights\n// because we have to (otherwise they'd do nothing). However, the support texture\n// has limited numerical precision; we'd need as much of it as we can for\n// getting the subpixel sampling right, and adding a large constant to each value\n// will reduce the precision further. Thus, the non-fractional part of the offset\n// is sent in through a uniform that we simply add in. (It should be said that\n// for high values of (dst_size/num_loop), we're pretty much hosed anyway wrt.\n// this accuracy.)\n//\n// Unfortunately, we cannot just do it at the beginning of the shader,\n// since the texcoord value is used to index into the support texture,\n// and if zoom != 1, the support texture will not wrap properly, causing\n// us to read the wrong texels. (Also remember that whole_pixel_offset is\n// measured in _input_ pixels and tc is in _output_ pixels, although we could\n// compensate for that.) However, the shader should be mostly bandwidth bound\n// and not ALU bound, so an extra add per sample shouldn't be too hopeless.\n//\n// Implicitly declared:\n// uniform float PREFIX(whole_pixel_offset);\n\n// Sample a single weight. 
First fetch information about where to sample\n// and the weight from sample_tex, and then read the pixel itself.\nvec4 PREFIX(do_sample)(vec2 tc, int i)\n{\n\tvec2 sample_tc;\n\tsample_tc.x = float(i) * PREFIX(sample_x_scale) + PREFIX(sample_x_offset);\n#if DIRECTION_VERTICAL\n\tsample_tc.y = tc.y * PREFIX(num_loops);\n#else\n\tsample_tc.y = tc.x * PREFIX(num_loops);\n#endif\n\tvec2 sample = tex2D(PREFIX(sample_tex), sample_tc).rg;\n\n#if DIRECTION_VERTICAL\n\ttc.y = sample.g + (floor(sample_tc.y) * PREFIX(slice_height) + PREFIX(whole_pixel_offset));\n#else\n\ttc.x = sample.g + (floor(sample_tc.y) * PREFIX(slice_height) + PREFIX(whole_pixel_offset));\n#endif\n\treturn vec4(sample.r) * INPUT(tc);\n}\n\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 sum = PREFIX(do_sample)(tc, 0);\n\tfor (int i = 1; i < PREFIX(num_samples); ++i) {\n\t\tsum += PREFIX(do_sample)(tc, i);\n\t}\n\treturn sum;\n}\n\n#undef DIRECTION_VERTICAL\n// Implicit uniforms:\n// uniform sampler2D PREFIX(dither_tex);\n// uniform vec2 PREFIX(tc_scale);\n// uniform float PREFIX(round_fac), PREFIX(inv_round_fac);\n\n// See footer.frag for details about this if statement.\n#ifndef YCBCR_ALSO_OUTPUT_RGBA\n#define YCBCR_ALSO_OUTPUT_RGBA 0\n#endif\n\n#if YCBCR_ALSO_OUTPUT_RGBA\n\n// There are two values to dither; otherwise, exactly the same as the algorithm below\n// (so comments are not duplicated).\n\nvec4[2] FUNCNAME(vec2 tc) {\n\tvec4[2] result = INPUT(tc);\n\tfloat d = tex2D(PREFIX(dither_tex), tc * PREFIX(tc_scale)).x;\n\tresult[0].rgb += vec3(d);\n\tresult[1].rgb += vec3(d);\n\n#if NEED_EXPLICIT_ROUND\n\tresult[0] = round(result[0] * vec4(PREFIX(round_fac))) * vec4(PREFIX(inv_round_fac));\n\tresult[1] = round(result[1] * vec4(PREFIX(round_fac))) * vec4(PREFIX(inv_round_fac));\n#endif\n\n\treturn result;\n}\n\n#else\n\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 result = INPUT(tc);\n\tfloat d = tex2D(PREFIX(dither_tex), tc * PREFIX(tc_scale)).x;\n\n\t// Don't dither alpha; the case of alpha=255 (1.0) is very important to us,\n\t// and if there's any inaccuracy earlier in the chain so that it becomes e.g.\n\t// 254.8, it's better to just get it rounded off than to dither and have it\n\t// possibly get down to 254. 
This is not the case for the color components.\n\tresult.rgb += vec3(d);\n\n\t// NEED_EXPLICIT_ROUND will be #defined to 1 if the GPU has inaccurate\n\t// fp32 -> int8 framebuffer rounding, and 0 otherwise.\n#if NEED_EXPLICIT_ROUND\n\tresult = round(result * vec4(PREFIX(round_fac))) * vec4(PREFIX(inv_round_fac));\n#endif\n\n\treturn result;\n}\n\n#endif\n// Implicit uniforms:\n// uniform vec4 PREFIX(samples)[(R + 1) * (R + 1)];\n\nvec4 FUNCNAME(vec2 tc) {\n\t// The full matrix has five different symmetry cases, that look like this:\n\t//\n\t// D D D C D D D\n\t// D D D C D D D\n\t// D D D C D D D\n\t// B B B A B B B\n\t// D D D C D D D\n\t// D D D C D D D\n\t// D D D C D D D\n\t//\n\t// We only store the lower-right part of the matrix:\n\t//\n\t// A B B B \n\t// C D D D\n\t// C D D D\n\t// C D D D\n\n\t// Case A: Top-left sample has no symmetry.\n\tvec4 sum = PREFIX(samples)[0].z * INPUT(tc);\n\n\t// Case B: Uppermost samples have left/right symmetry.\n\tfor (int x = 1; x <= R; ++x) {\n\t\tvec4 sample = PREFIX(samples)[x];\n\t\tsum += sample.z * (INPUT(tc - sample.xy) + INPUT(tc + sample.xy));\n\t}\n\n\t// Case C: Leftmost samples have top/bottom symmetry.\n\tfor (int y = 1; y <= R; ++y) {\n\t\tvec4 sample = PREFIX(samples)[y * (R + 1)];\n\t\tsum += sample.z * (INPUT(tc - sample.xy) + INPUT(tc + sample.xy));\n\t}\n\n\t// Case D: All other samples have four-way symmetry.\n\t// (Actually we have eight-way, but since we are using normalized\n\t// coordinates, we can't just flip x and y.)\n\tfor (int y = 1; y <= R; ++y) {\n\t\tfor (int x = 1; x <= R; ++x) {\n\t\t\tvec4 sample = PREFIX(samples)[y * (R + 1) + x];\n\t\t\tvec2 mirror_sample = vec2(sample.x, -sample.y);\n\n\t\t\tvec4 local_sum = INPUT(tc - sample.xy) + INPUT(tc + sample.xy);\n\t\t\tlocal_sum += INPUT(tc - mirror_sample.xy) + INPUT(tc + mirror_sample.xy);\n\t\t\tsum += sample.z * local_sum;\n\t\t}\n\t}\n\n\treturn sum;\n}\n\n#undef R\n// DIRECTION_VERTICAL will be #defined to 1 if we are doing a vertical FFT,\n// and 0 otherwise.\n\n// Implicit uniforms:\n// uniform float PREFIX(num_repeats);\n// uniform sampler2D PREFIX(support_tex);\n\nvec4 FUNCNAME(vec2 tc) {\n#if DIRECTION_VERTICAL\n\tvec4 support = tex2D(PREFIX(support_tex), vec2(tc.y * PREFIX(num_repeats), 0.0));\n        vec4 c1 = INPUT(vec2(tc.x, tc.y + support.x));\n        vec4 c2 = INPUT(vec2(tc.x, tc.y + support.y));\n#else\n\tvec4 support = tex2D(PREFIX(support_tex), vec2(tc.x * PREFIX(num_repeats), 0.0));\n        vec4 c1 = INPUT(vec2(tc.x + support.x, tc.y));\n        vec4 c2 = INPUT(vec2(tc.x + support.y, tc.y));\n#endif\n\t// Two complex additions and multiplications in parallel; essentially\n\t//\n\t//   result.xy = c1.xy + twiddle * c2.xy\n\t//   result.zw = c1.zw + twiddle * c2.zw\n\t//\n\t// where * is complex multiplication.\n\treturn c1 + support.z * c2 + support.w * vec4(-c2.y, c2.x, -c2.w, c2.z);\n}\n\n#undef DIRECTION_VERTICAL\n// A simple, circular vignette, with a cos\302\262 falloff.\n\n// Implicit uniforms:\n// uniform float PREFIX(pihalf_div_radius);\n//\n// uniform vec2 PREFIX(aspect_correction);\n// uniform vec2 PREFIX(flipped_center);\n\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 x = INPUT(tc);\n\n\tconst float pihalf = 0.5 * 3.14159265358979324;\n\n\tvec2 normalized_pos = (tc - PREFIX(flipped_center)) * PREFIX(aspect_correction);\n\tfloat dist = (length(normalized_pos) - PREFIX(inner_radius)) * PREFIX(pihalf_div_radius);\n\tfloat linear_falloff = clamp(dist, 0.0, pihalf);\n\tfloat falloff = cos(linear_falloff) * cos(linear_falloff);\n\tx.rgb *= 
vec3(falloff);\n\n\treturn x;\n}\n// Implicit uniforms:\n// uniform float PREFIX(output_coord_to_slice_num);\n// uniform float PREFIX(slice_num_to_input_coord);\n// uniform float PREFIX(slice_offset_to_input_coord);\n// uniform float PREFIX(normalized_offset);\n \nvec4 FUNCNAME(vec2 tc) {\n\t// DIRECTION_VERTICAL will be #defined to 1 if we are expanding vertically,\n\t// and 0 otherwise.\n#if DIRECTION_VERTICAL\n\tfloat sliced_coord = 1.0 - tc.y;\n#else\n\tfloat sliced_coord = tc.x;\n#endif\n\n\t// Find out which slice we are in, and a 0..1 coordinate for the offset within that slice.\n\tfloat slice_num = floor(sliced_coord * PREFIX(output_coord_to_slice_num));\n\tfloat slice_offset = fract(sliced_coord * PREFIX(output_coord_to_slice_num));\n\n\t// Find out where this slice begins in the input data, and then offset from that.\n\tfloat input_coord = slice_num * PREFIX(slice_num_to_input_coord) + slice_offset * PREFIX(slice_offset_to_input_coord) + PREFIX(normalized_offset);\n\n#if DIRECTION_VERTICAL\n\treturn INPUT(vec2(tc.x, 1.0 - input_coord));\n#else\n\treturn INPUT(vec2(input_coord, tc.y));\n#endif\n}\n\n#undef DIRECTION_VERTICAL\n// Implicit uniforms:\n// uniform vec2 PREFIX(num_repeats);\n\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 pixel = INPUT1(tc);\n\tvec2 pattern = INPUT2(tc * PREFIX(num_repeats)).xy;\n\n\t// Complex multiplication between each of (pixel.xy, pixel.zw) and pattern.xy.\n\treturn pattern.x * pixel + pattern.y * vec4(-pixel.y, pixel.x, -pixel.w, pixel.z);\n}\n// Implicit uniforms:\n// uniform float PREFIX(progress_mul_w_plus_one);\n// uniform bool PREFIX(bool_inverse);\n\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 first = INPUT1(tc);\n\tvec4 second = INPUT2(tc);\n\n\t// We treat the luma as going from 0 to w, where w is the transition width\n\t// (wider means the boundary between transitioned and non-transitioned\n\t// will be harder, while w=0 is essentially just a straight fade).\n\t// We need to map this 0..w range in the luma image to a (clamped) 0..1\n\t// range for how far this pixel has come in a fade. 
At the very\n\t// beginning, we can visualize it like this, where every pixel is in\n\t// the state 0.0 (100% first image, 0% second image):\n\t//\n\t//         0                     w\n\t//   luma: |---------------------|\n\t//   mix:                        |----|\n\t//                               0    1\n\t//\n\t// Then as we progress, eventually the luma range should move to the right\n\t// so that more pixels start moving towards higher mix value:\n\t//\n\t//           0                     w\n\t//   luma:   |---------------------|\n\t//   mix:                        |----|\n\t//                               0    1\n\t//\n\t// and at the very end, all pixels should be in the state 1.0 (0% first image,\n\t// 100% second image):\n\t//\n\t//                                    0                     w\n\t//   luma:                            |---------------------|\n\t//   mix:                        |----|\n\t//                               0    1\n\t//\n\t// So clearly, it should move (w+1) units to the right, and apart from that\n\t// just stay a simple mapping.\n\tfloat w = PREFIX(transition_width);\n\tfloat luma = INPUT3(tc).x;\n\tif (PREFIX(bool_inverse)) {\n\t\tluma = 1.0 - luma;\n\t}\n\tfloat m = clamp((luma * w - w) + PREFIX(progress_mul_w_plus_one), 0.0, 1.0);\n\n\treturn mix(first, second, m);\n}\n// See footer.frag for details about this if statement.\n#ifndef YCBCR_ALSO_OUTPUT_RGBA\n#define YCBCR_ALSO_OUTPUT_RGBA 0\n#endif\n\n#if YCBCR_ALSO_OUTPUT_RGBA\nvec4[2] FUNCNAME(vec2 tc) {\n#else\nvec4 FUNCNAME(vec2 tc) {\n#endif\n\tvec4 rgba = INPUT(tc);\n\tvec4 ycbcr_a;\n\n\tycbcr_a.rgb = PREFIX(ycbcr_matrix) * rgba.rgb + PREFIX(offset);\n\n\tif (PREFIX(clamp_range)) {\n\t\t// If we use limited-range Y'CbCr, the card's usual 0\342\200\223255 clamping\n\t\t// won't be enough, so we need to clamp ourselves here.\n\t\t//\n\t\t// We clamp before dither, which is a bit unfortunate, since\n\t\t// it means dither can take us out of the clamped range again.\n\t\t// However, since DitherEffect never adds enough dither to change\n\t\t// the quantized levels, we will be fine in practice.\n\t\tycbcr_a.rgb = clamp(ycbcr_a.rgb, PREFIX(ycbcr_min), PREFIX(ycbcr_max));\n\t}\n\n\tycbcr_a.a = rgba.a;\n\n#if YCBCR_ALSO_OUTPUT_RGBA\n\treturn vec4[2](ycbcr_a, rgba);\n#else\n\treturn ycbcr_a;\n#endif\n}\n// Implicit uniforms:\n// uniform int PREFIX(current_field_position);\n// uniform float PREFIX(num_lines);\n// uniform float PREFIX(self_offset);\n// uniform float PREFIX(inv_width);\n// uniform float PREFIX(current_offset)[2];\n// uniform float PREFIX(other_offset)[3];\n\n// The best explanation of YADIF that I've seen is actually a pseudocode\n// reimplementation from the Doom9 forum:\n//\n//   http://forum.doom9.org/showthread.php?p=980375#post980375\n//\n// We generally follow its terminology instead of the original C source\n// (which I'll refer to as \342\200\234C YADIF\342\200\235), although I've used the C source as a\n// reference to double-check at times. We're not bit-exact the same as\n// C YADIF; in particular, we work in linear light, and left/right edge\n// handling might also be a bit different (for top/bottom edge handling,\n// C YADIF repeats texels like we do). 
Also, C YADIF generally works on\n// Y', Cb and Cr planes separately, while we work on the entire RGBA triplet\n// and do our spatial interpolation decisions based on the pixel as a whole,\n// so our decision metric also naturally becomes different.\n\n#define DIFF(s1, s2) dot((s1) - (s2), (s1) - (s2))\n\nvec4 FUNCNAME(vec2 tc) {\n\tint yi = int(round(tc.y * PREFIX(num_lines) - 0.5f));\n\n\t// Figure out if we just want to keep the current line or if\n\t// we need to interpolate. This branch is obviously divergent,\n\t// but the very nature of deinterlacing would seem to require that.\n\t//\n\t// Note that since we have bottom-left origin, yi % 2 will return 0\n\t// for bottom and 1 for top.\n\tif ((yi % 2) != PREFIX(current_field_position)) {\n\t\treturn INPUT3(vec2(tc.x, tc.y + PREFIX(self_offset)));\n\t}\n\n\t// First, estimate the current pixel from the neighboring pixels in the\n\t// same field (spatial interpolation). We try first 0 degrees (straight\n\t// up/down), then \302\26145 degrees and then finally \302\26163 degrees. The best of\n\t// these, as determined by the \342\200\234spatial score\342\200\235 (basically sum of squared\n\t// differences in three neighboring pixels), is kept.\n\t//\n\t// The C version of YADIF goesn't check +63\302\260 unless +45\302\260 gave an improvement,\n\t// and similarly not -63\302\260 unless -45\302\260 did. The MMX version goes through pains\n\t// to simulate the same, but notes that it \342\200\234hurts both quality and speed\342\200\235.\n\t// We're not bit-exact the same as the C version anyway, and not sampling\n\t// \302\26163\302\260 would probably be a rather divergent branch, so we just always do it.\n\n\t// a b c d e f g     \342\206\221 y\n\t//       x           |\n\t// h i j k l m n     +--> x\n\n\tvec2 a_pos = vec2(tc.x - 3.0 * PREFIX(inv_width), tc.y + PREFIX(current_offset)[1]);\n\tvec2 b_pos = vec2(tc.x - 2.0 * PREFIX(inv_width), a_pos.y);\n\tvec2 c_pos = vec2(tc.x -       PREFIX(inv_width), a_pos.y);\n\tvec2 d_pos = vec2(tc.x,                           a_pos.y);\n\tvec2 e_pos = vec2(tc.x +       PREFIX(inv_width), a_pos.y);\n\tvec2 f_pos = vec2(tc.x + 2.0 * PREFIX(inv_width), a_pos.y);\n\tvec2 g_pos = vec2(tc.x + 3.0 * PREFIX(inv_width), a_pos.y);\n\n\tvec2 h_pos = vec2(tc.x - 3.0 * PREFIX(inv_width), tc.y + PREFIX(current_offset)[0]);\n\tvec2 i_pos = vec2(tc.x - 2.0 * PREFIX(inv_width), h_pos.y);\n\tvec2 j_pos = vec2(tc.x -       PREFIX(inv_width), h_pos.y);\n\tvec2 k_pos = vec2(tc.x,                           h_pos.y);\n\tvec2 l_pos = vec2(tc.x +       PREFIX(inv_width), h_pos.y);\n\tvec2 m_pos = vec2(tc.x + 2.0 * PREFIX(inv_width), h_pos.y);\n\tvec2 n_pos = vec2(tc.x + 3.0 * PREFIX(inv_width), h_pos.y);\n\n\tvec4 a = INPUT3(a_pos);\n\tvec4 b = INPUT3(b_pos);\n\tvec4 c = INPUT3(c_pos);\n\tvec4 d = INPUT3(d_pos);\n\tvec4 e = INPUT3(e_pos);\n\tvec4 f = INPUT3(f_pos);\n\tvec4 g = INPUT3(g_pos);\n\tvec4 h = INPUT3(h_pos);\n\tvec4 i = INPUT3(i_pos);\n\tvec4 j = INPUT3(j_pos);\n\tvec4 k = INPUT3(k_pos);\n\tvec4 l = INPUT3(l_pos);\n\tvec4 m = INPUT3(m_pos);\n\tvec4 n = INPUT3(n_pos);\n\n\t// 0 degrees. Note that pred is actually twice the real spatial prediction;\n\t// we halve it later to same some arithmetic. 
Also, our spatial score is not\n\t// the same as in C YADIF; we use the total squared sum over all four\n\t// channels instead of deinterlacing each channel separately.\n\t//\n\t// Note that there's a small, arbitrary bonus for this first alternative,\n\t// so that vertical interpolation wins if everything else is equal.\n\tvec4 pred = d + k;\n\tfloat score;\n\tfloat best_score = DIFF(c, j) + DIFF(d, k) + DIFF(e, l) - 1e-4;\n\n\t// -45 degrees.\n\tscore = DIFF(b, k) + DIFF(c, l) + DIFF(d, m);\n\tif (score < best_score) {\n\t\tpred = c + l;\n\t\tbest_score = score;\n\t}\n\n\t// -63 degrees.\n\tscore = DIFF(a, l) + DIFF(b, m) + DIFF(c, n);\n\tif (score < best_score) {\n\t\tpred = b + m;\n\t\tbest_score = score;\n\t}\n\n\t// +45 degrees.\n\tscore = DIFF(d, i) + DIFF(e, j) + DIFF(f, k);\n\tif (score < best_score) {\n\t\tpred = e + j;\n\t\tbest_score = score;\n\t}\n\n\t// +63 degrees.\n\tscore = DIFF(e, h) + DIFF(f, i) + DIFF(g, j);\n\tif (score < best_score) {\n\t\tpred = f + i;\n\t\t// best_score isn't used anymore.\n\t}\n\n\tpred *= 0.5f;\n\n\t// Now we do a temporal prediction (p2) of this pixel based on the previous\n\t// and next fields. The spatial prediction is clamped so that it is not\n\t// too far from this temporal prediction, where \342\200\234too far\342\200\235 is based on\n\t// the amount of local temporal change. (In other words, the temporal prediction\n\t// is the safe choice, and the question is how far away from that we'll let\n\t// our spatial choice run.) Note that here, our difference metric\n\t// _is_ the same as C YADIF, namely per-channel abs.\n\t//\n\t// The sample positions look like this; in order to avoid variable name conflicts\n\t// with the spatial interpolation, we use uppercase names. x is, again,\n\t// the current pixel we're trying to estimate.\n\t//\n\t//     C   H      \342\206\221 y\n\t//   A   F   K    |\n\t//     D x I      |\n\t//   B   G   L    |\n\t//     E   J      +-----> time\n\t//\n\tvec2 AFK_pos = d_pos;\n\tvec2 BGL_pos = k_pos;\n\tvec4 A = INPUT1(AFK_pos);\n\tvec4 B = INPUT1(BGL_pos);\n\tvec4 F = d;\n\tvec4 G = k;\n\tvec4 K = INPUT5(AFK_pos);\n\tvec4 L = INPUT5(BGL_pos);\n\n\tvec2 CH_pos = vec2(tc.x, tc.y + PREFIX(other_offset)[2]);\n\tvec2 DI_pos = vec2(tc.x, tc.y + PREFIX(other_offset)[1]);\n\tvec2 EJ_pos = vec2(tc.x, tc.y + PREFIX(other_offset)[0]);\n\n\tvec4 C = INPUT2(CH_pos);\n\tvec4 D = INPUT2(DI_pos);\n\tvec4 E = INPUT2(EJ_pos);\n\n\tvec4 H = INPUT4(CH_pos);\n\tvec4 I = INPUT4(DI_pos);\n\tvec4 J = INPUT4(EJ_pos);\n\n\t// Find temporal differences around this line, using all five fields.\n\t// tdiff0 is around the current field, tdiff1 is around the previous one,\n\t// tdiff2 is around the next one.\n\tvec4 tdiff0 = abs(D - I);\n\tvec4 tdiff1 = abs(A - F) + abs(B - G);  // Actually twice tdiff1.\n\tvec4 tdiff2 = abs(K - F) + abs(L - G);  // Actually twice tdiff2.\n\tvec4 diff = max(tdiff0, 0.5f * max(tdiff1, tdiff2));\n\n\t// The following part is the spatial interlacing check, which loosens up the\n\t// allowable temporal change. 
(See also the comments in the .h file.)\n\t// It costs us four extra loads (C, E, H, J) and a few extra ALU ops;\n\t// we're already very load-heavy, so the extra ALU is effectively free.\n\t// It costs about 18% performance in some benchmarks, which squares\n\t// well with going from 20 to 24 loads (a 20% increase), although for\n\t// total overall performance in longer chains, the difference is nearly zero.\n\t//\n\t// The basic idea is seemingly to allow more change if there are large spatial\n\t// vertical changes, even if there are few temporal changes. These differences\n\t// are signed, though, which make it more tricky to follow, although they seem\n\t// to reduce into some sort of pseudo-abs. I will not claim to understand them\n\t// very well.\n\t//\n\t// We start by temporally interpolating the current vertical line (p0\342\200\223p4):\n\t//\n\t//     C p0 H      \342\206\221 y\n\t//   A   p1   K    |\n\t//     D p2 I      |\n\t//   B   p3   L    |\n\t//     E p4 J      +-----> time\n\t//\n\t// YADIF_ENABLE_SPATIAL_INTERLACING_CHECK will be #defined to 1\n\t// if the check is enabled. Otherwise, the compiler should\n\t// be able to remove the dependent code quite easily.\n\tvec4 p0 = 0.5f * (C + H);\n\tvec4 p1 = F;\n\tvec4 p2 = 0.5f * (D + I);\n\tvec4 p3 = G;\n\tvec4 p4 = 0.5f * (E + J);\n\n#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK\n\tvec4 max_ = max(max(p2 - p3, p2 - p1), min(p0 - p1, p4 - p3));\n\tvec4 min_ = min(min(p2 - p3, p2 - p1), max(p0 - p1, p4 - p3));\n\tdiff = max(diff, max(min_, -max_));\n#endif\n\n\treturn clamp(pred, p2 - diff, p2 + diff);\n}\n\n#undef DIFF\n#undef YADIF_ENABLE_SPATIAL_INTERLACING_CHECK\nvec4 FUNCNAME(vec2 tc) {\n\t// Your code goes here, obviously.\n\t// You can use PREFIX(parm) to access the parameter you gave in.\n\treturn INPUT(tc);\n}\n// Mirrors the image horizontally.\nvec4 FUNCNAME(vec2 tc)\n{\n\ttc.x = 1.0 - tc.x;\n\treturn INPUT(tc);\n}\nvec4 FUNCNAME(vec2 tc) {\n\treturn INPUT(tc) * PREFIX(factor);\n}\n// Implicit uniforms:\n// uniform int PREFIX(current_field_position);\n// uniform float PREFIX(inv_width);\n// uniform float PREFIX(inv_height);\n// uniform float PREFIX(current_field_vertical_offset);\n\n// Compute shader implementation of DeinterlaceEffect. See the fragment\n// shader implementation (deinterlace_effect.frag) for comments about the\n// algorithm; comments here will mainly be about issues specific to the\n// compute shader implementation.\n\n#define DIFF(s1, s2) dot((s1) - (s2), (s1) - (s2))\n\n// In input pixels (so output will be 8x32). Corresponds to get_compute_dimensions()\n// in the C++ code. It is illogical that 8x32 would be better than e.g. 32x8,\n// since we reuse more data horizontally, but especially Intel cards are much more\n// happy about this for whatever reason.\n#define GROUP_W 8\n#define GROUP_H 16\n\n// When sampling from the current field (spatial interpolation below), we have\n// a fringe of three pixels on the left and right sides, so we need to load\n// more. 
We also have one pixel above and below, although our destination pixel\n// is squeezed in the middle of them (they don't overlap), so we only need one\n// extra pixel.\n#define GROUP_W_FRINGE (GROUP_W + 6)\n#define GROUP_H_FRINGE (GROUP_H + 1)\n\nlayout(local_size_x = GROUP_W, local_size_y = GROUP_H) in;\n\n#if (GROUP_W_FRINGE * GROUP_H_FRINGE) > (GROUP_W * (GROUP_H + 2))\n#define TEMP_NUM_ELEM (GROUP_W_FRINGE * GROUP_H_FRINGE)\n#else\n#define TEMP_NUM_ELEM (GROUP_W * (GROUP_H + 2))\n#endif\n\nshared vec4 temp[TEMP_NUM_ELEM];\n\n#if TEMP_NUM_ELEM > (GROUP_W * GROUP_H * 2)\n#error Not enough threads to load all data in two loads\n#endif\n\n// Load a WxH block of samples. We need to do this in two phases,\n// since we have more input samples than we have output samples (threads);\n// in the second phase, some threads will be idle.\n#define LOAD_PIXEL_BLOCK(base_tc, block_width, block_height, func) \\\n{ \\\n\tmemoryBarrierShared(); \\\n\tbarrier(); \\\n\tint thread_id = int(gl_LocalInvocationID.y) * GROUP_W + int(gl_LocalInvocationID.x); \\\n\t{ \\\n\t\tint x = thread_id % (block_width); \\\n\t\tint y = thread_id / (block_width); \\\n\t\ttemp[thread_id] = func(vec2((base_tc).x + x * PREFIX(inv_width), \\\n\t\t                            (base_tc).y + y * PREFIX(inv_height))); \\\n\t} \\\n\tconst int num_threads = GROUP_W * GROUP_H; \\\n\tif (thread_id + num_threads < (block_width) * (block_height)) { \\\n\t\tint x = (thread_id + num_threads) % (block_width); \\\n\t\tint y = (thread_id + num_threads) / (block_width); \\\n\t\ttemp[thread_id + num_threads] = \\\n\t\t\tfunc(vec2((base_tc).x + x * PREFIX(inv_width), \\\n\t\t                  (base_tc).y + y * PREFIX(inv_height))); \\\n\t} \\\n\tmemoryBarrierShared(); \\\n\tbarrier(); \\\n}\n\nvoid FUNCNAME() {\n\t// The current thread is responsible for output of two pixels, namely (x,2y)\n\t// and (x,2y+1). One will be an unmodified one, the other one will be the\n\t// pixel we are trying to interpolate. If TFF (current_field_position==0),\n\t// the unmodified one is 2y+1 (remember OpenGL's bottom-left convention),\n\t// and if BFF, the unmodified one is 2y. So we need to invert current_field_position\n\t// to figure out which value to add.\n\tint yi = int(gl_GlobalInvocationID.y) * 2 + (PREFIX(current_field_position) ^ 1);\n\n\t// Load in data for the current field. current_offset signals where the block\n\t// starts vertically; see set_gl_state() in the C++ code.\n\tvec2 base_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + (0.5f - 3.0f)) * PREFIX(inv_width),\n\t                    (gl_WorkGroupID.y * uint(GROUP_H) + 0.5f) * PREFIX(inv_height) + PREFIX(current_field_vertical_offset));\n\tLOAD_PIXEL_BLOCK(base_tc, GROUP_W_FRINGE, GROUP_H_FRINGE, INPUT3);\n\n\tint lx = int(gl_LocalInvocationID.x) + 3;\n\tint ly = int(gl_LocalInvocationID.y);\n\n\t// Output the unmodified pixel. 
For TFF (current_field_position == 0),\n\t// we have an extra pixel on the bottom that we're only using for interpolation\n\t// (it's being output by another workgroup), so we have to add 1.\n\tvec4 val = temp[(ly + (PREFIX(current_field_position) ^ 1)) * GROUP_W_FRINGE + lx];\n\tOUTPUT(ivec2(gl_GlobalInvocationID.x, yi), val);\n\n\t// a b c d e f g     \342\206\221 y\n\t//       x           |\n\t// h i j k l m n     +--> x\n\n\tvec4 a = temp[(ly + 1) * GROUP_W_FRINGE + lx - 3];\n\tvec4 b = temp[(ly + 1) * GROUP_W_FRINGE + lx - 2];\n\tvec4 c = temp[(ly + 1) * GROUP_W_FRINGE + lx - 1];\n\tvec4 d = temp[(ly + 1) * GROUP_W_FRINGE + lx];\n\tvec4 e = temp[(ly + 1) * GROUP_W_FRINGE + lx + 1];\n\tvec4 f = temp[(ly + 1) * GROUP_W_FRINGE + lx + 2];\n\tvec4 g = temp[(ly + 1) * GROUP_W_FRINGE + lx + 3];\n\n\tvec4 h = temp[ly * GROUP_W_FRINGE + lx - 3];\n\tvec4 i = temp[ly * GROUP_W_FRINGE + lx - 2];\n\tvec4 j = temp[ly * GROUP_W_FRINGE + lx - 1];\n\tvec4 k = temp[ly * GROUP_W_FRINGE + lx];\n\tvec4 l = temp[ly * GROUP_W_FRINGE + lx + 1];\n\tvec4 m = temp[ly * GROUP_W_FRINGE + lx + 2];\n\tvec4 n = temp[ly * GROUP_W_FRINGE + lx + 3];\n\n\t// 0 degrees.\n\tvec4 pred = d + k;\n\tfloat score;\n\tfloat best_score = DIFF(c, j) + DIFF(d, k) + DIFF(e, l) - 1e-4;\n\n\t// -45 degrees.\n\tscore = DIFF(b, k) + DIFF(c, l) + DIFF(d, m);\n\tif (score < best_score) {\n\t\tpred = c + l;\n\t\tbest_score = score;\n\t}\n\n\t// -63 degrees.\n\tscore = DIFF(a, l) + DIFF(b, m) + DIFF(c, n);\n\tif (score < best_score) {\n\t\tpred = b + m;\n\t\tbest_score = score;\n\t}\n\n\t// +45 degrees.\n\tscore = DIFF(d, i) + DIFF(e, j) + DIFF(f, k);\n\tif (score < best_score) {\n\t\tpred = e + j;\n\t\tbest_score = score;\n\t}\n\n\t// +63 degrees.\n\tscore = DIFF(e, h) + DIFF(f, i) + DIFF(g, j);\n\tif (score < best_score) {\n\t\tpred = f + i;\n\t\t// best_score isn't used anymore.\n\t}\n\n\tpred *= 0.5f;\n\n\t// Temporal prediction (p2) of this pixel based on the previous and next fields.\n\t//\n\t//                \342\206\221 y\n\t//     C   H      |\n\t//   A   F   K    |\n\t//     D x I      |\n\t//   B   G   L    |\n\t//     E   J      |\n\t//                +-----> time\n\t//\n\t// x is obviously aligned with D and I, so we don't need texcoord\n\t// adjustment for top/bottom field here, unlike earlier. 
However, we need\n\t// to start the block one pixel below since we need E/J, thus the -1 in\n\t// the y coordinate.\n\tbase_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + 0.5f) * PREFIX(inv_width),\n\t               (gl_WorkGroupID.y * uint(GROUP_H) + (0.5f - 1.0f)) * PREFIX(inv_height));\n\tlx = int(gl_LocalInvocationID.x);\n#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK\n\tLOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT2);\n\tvec4 C = temp[(ly + 2) * GROUP_W + lx];\n\tvec4 D = temp[(ly + 1) * GROUP_W + lx];\n\tvec4 E = temp[ ly      * GROUP_W + lx];\n\n\tLOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT4);\n\tvec4 H = temp[(ly + 2) * GROUP_W + lx];\n\tvec4 I = temp[(ly + 1) * GROUP_W + lx];\n\tvec4 J = temp[ ly      * GROUP_W + lx];\n#else\n\t// Since spatial interlacing check is not enabled, we only need D\n\t// and I from the previous and next fields; since they are not shared\n\t// between the neighboring pixels, they can be straight-up loads.\n\tvec2 DI_pos = vec2((gl_GlobalInvocationID.x + 0.5f) * PREFIX(inv_width),\n\t                   (gl_GlobalInvocationID.y + 0.5f) * PREFIX(inv_height));\n\tvec4 D = INPUT2(DI_pos);\n\tvec4 I = INPUT4(DI_pos);\n#endif\n\n\t// Load what we need from the previous field into shared memory,\n\t// since A/B can be reused between neighboring pixels. We need one\n\t// line above/below, but we don't need the horizontal fringe.\n\tLOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT1);\n\tvec4 A = temp[(ly + 1) * GROUP_W + lx];\n\tvec4 B = temp[ ly      * GROUP_W + lx];\n\n\t// What we need from the current field was loaded earlier.\n\tvec4 F = d;\n\tvec4 G = k;\n\n\t// Next field.\n\tLOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT5);\n\tvec4 K = temp[(ly + 1) * GROUP_W + lx];\n\tvec4 L = temp[ ly      * GROUP_W + lx];\n\n\t// Find temporal differences around this line.\n\tvec4 tdiff0 = abs(D - I);\n\tvec4 tdiff1 = abs(A - F) + abs(B - G);  // Actually twice tdiff1.\n\tvec4 tdiff2 = abs(K - F) + abs(L - G);  // Actually twice tdiff2.\n\tvec4 diff = max(tdiff0, 0.5f * max(tdiff1, tdiff2));\n\n#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK\n\t// Spatial interlacing check.\n\t// We start by temporally interpolating the current vertical line (p0\342\200\223p4):\n\t//\n\t//     C p0 H      \342\206\221 y\n\t//       p1        |\n\t//     D p2 I      |\n\t//       p3        |\n\t//     E p4 J      +-----> time\n\t//\n\tvec4 p0 = 0.5f * (C + H);\n\tvec4 p1 = F;\n\tvec4 p2 = 0.5f * (D + I);\n\tvec4 p3 = G;\n\tvec4 p4 = 0.5f * (E + J);\n\n\tvec4 max_ = max(max(p2 - p3, p2 - p1), min(p0 - p1, p4 - p3));\n\tvec4 min_ = min(min(p2 - p3, p2 - p1), max(p0 - p1, p4 - p3));\n\tdiff = max(diff, max(min_, -max_));\n#else\n\tvec4 p2 = 0.5f * (D + I);\n#endif\n\n\tval = clamp(pred, p2 - diff, p2 + diff);\n\tOUTPUT(ivec2(gl_GlobalInvocationID.x, yi ^ 1), val);\n}\n\n#undef LOAD_PIXEL_BLOCK\n#undef DIFF\n#undef YADIF_ENABLE_SPATIAL_INTERLACING_CHECK\nvec4 FUNCNAME(vec2 tc) {\n\treturn max(INPUT(tc) - vec4(PREFIX(cutoff)), 0.0);\n}\nvec4 FUNCNAME(vec2 tc) {\n\tvec4 orig = INPUT1(tc);\n\tvec4 blurred = INPUT2(tc);\n\tfloat luminance = clamp(dot(orig.rgb, vec3(0.2126, 0.7152, 0.0722)), 0.0, 1.0);\n\treturn mix(orig, blurred, luminance * vec4(PREFIX(blurred_mix_amount)));\n}\n";

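// Illustrative lookup helper: a minimal sketch of how a consumer could slice
// one shader source out of the bundle. It assumes BundledShader's fields are
// named (filename, offset, length); the real names live in bundled_shaders.h.
// This helper is not part of the autogenerated output or of Movit's API.
static std::string find_bundled_shader_sketch(const std::string &filename)
{
	// The table is terminated by a null-filename sentinel (see above).
	for (const BundledShader *s = bundled_shaders; s->filename != nullptr; ++s) {
		if (filename == s->filename) {
			// offset and length are byte positions into the single
			// concatenated shader_bundle string.
			return std::string(shader_bundle + s->offset, s->length);
		}
	}
	return "";  // Not found.
}
// Example: find_bundled_shader_sketch("identity.frag") would return the
// 107-byte identity shader starting at byte offset 5193, per the table above.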
}  // namespace movit