/* This file is autogenerated by u_format_table.py from u_format.csv. Do not edit directly. */

/**************************************************************************
 *
 * Copyright 2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "util/format/u_format.h"
#include "u_format_bptc.h"
#include "u_format_fxt1.h"
#include "u_format_s3tc.h"
#include "u_format_rgtc.h"
#include "u_format_latc.h"
#include "u_format_etc.h"


#include "pipe/p_compiler.h"
#include "util/u_math.h"
#include "util/half_float.h"
#include "u_format.h"
#include "u_format_other.h"
#include "util/format_srgb.h"
#include "format_utils.h"
#include "u_format_yuv.h"
#include "u_format_zs.h"
#include "u_format_pack.h"

/**
 * Expand a row of one-byte "none" texels into RGBA floats.
 *
 * Each source byte is converted to float as-is (no normalization) and stored
 * as red; green and blue are written as 0 and alpha as 1.
 *
 * dst_row receives 4 floats per texel, src supplies one byte per texel and
 * width is the number of texels to convert.
 */
void
util_format_none_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   const uint8_t *texel = src;

   for (unsigned remaining = width; remaining != 0; --remaining) {
      const uint8_t red = *texel++;

      rgba[0] = (float)red;
      rgba[1] = 0;
      rgba[2] = 0;
      rgba[3] = 1;
      rgba += 4;
   }
}

/**
 * Pack a 2D block of RGBA floats into one-byte "none" texels.
 *
 * Only the red component is stored: it is clamped to [0, 255] and truncated
 * to a byte. Green, blue and alpha are ignored.
 *
 * dst_stride is added to the destination pointer per row; src_stride is in
 * bytes and is converted to a float count before advancing the source.
 */
void
util_format_none_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *texel = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         *texel = (uint8_t)CLAMP(rgba[0], 0.0f, 255.0f);
         texel += 1;
         rgba += 4;
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch a single "none" texel as RGBA floats.
 *
 * The byte at src becomes red (plain integer-to-float conversion); green and
 * blue are 0 and alpha is 1. The i/j coordinates are not used.
 */
void
util_format_none_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint8_t red = src[0];

   rgba[0] = (float)red;
   rgba[1] = 0;
   rgba[2] = 0;
   rgba[3] = 1;
}

/**
 * Expand a row of one-byte "none" texels into RGBA8 unorm.
 *
 * The source byte is clamped to at most 1 and then scaled by 0xff, so any
 * non-zero byte yields red = 255 and zero yields red = 0. Green and blue are
 * written as 0 and alpha as 255.
 */
void
util_format_none_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   const uint8_t *texel = src;

   for (unsigned remaining = width; remaining != 0; --remaining) {
      const uint8_t r = *texel;

      rgba[0] = (uint8_t)(((uint32_t)MIN2(r, 1)) * 0xff / 0x1);
      rgba[1] = 0;
      rgba[2] = 0;
      rgba[3] = 255;

      texel += 1;
      rgba += 4;
   }
}

/**
 * Pack a 2D block of RGBA8 unorm pixels into one-byte "none" texels.
 *
 * Only red is stored, computed as red * 1 / 255 with integer division, so
 * the result is 1 when red is 255 and 0 otherwise.
 *
 * dst_stride and src_stride are per-row advances (the source element is one
 * byte wide, so its byte stride is used directly).
 */
void
util_format_none_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *texel = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint32_t red = (uint32_t)rgba[0];

         *texel = (uint8_t)(red * 0x1 / 0xff);
         texel += 1;
         rgba += 4;
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack a row of B8G8R8A8_UNORM texels into RGBA floats.
 *
 * Every texel is read as one 32-bit word; the four 8-bit channels are
 * extracted and converted with ubyte_to_float(). The output order is always
 * r, g, b, a.
 */
void
util_format_b8g8r8a8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit position of each channel within the 32-bit word as loaded. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   const uint32_t *words = (const uint32_t *)src;
   float *rgba = dst_row;

   for (unsigned n = 0; n < width; ++n, rgba += 4) {
      const uint32_t packed = words[n];

      rgba[0] = ubyte_to_float((packed >> r_shift) & 0xff);
      rgba[1] = ubyte_to_float((packed >> g_shift) & 0xff);
      rgba[2] = ubyte_to_float((packed >> b_shift) & 0xff);
      rgba[3] = ubyte_to_float((packed >> a_shift) & 0xff);
   }
}

/**
 * Pack a 2D block of RGBA floats into B8G8R8A8_UNORM texels.
 *
 * Each component goes through float_to_ubyte() and the four bytes are
 * combined into one 32-bit word that is stored to the destination.
 *
 * dst_stride is added to the destination pointer per row; src_stride is in
 * bytes and is converted to a float count before advancing the source.
 */
void
util_format_b8g8r8a8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel within the 32-bit word as stored. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif

   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4) {
         const uint32_t b = (float_to_ubyte(rgba[2])) & 0xff;
         const uint32_t g = (float_to_ubyte(rgba[1])) & 0xff;
         const uint32_t r = (float_to_ubyte(rgba[0])) & 0xff;
         const uint32_t a = (float_to_ubyte(rgba[3])) & 0xff;

         *out++ = (b << b_shift) | (g << g_shift) | (r << r_shift) | (a << a_shift);
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch one B8G8R8A8_UNORM texel as RGBA floats.
 *
 * Reads a single 32-bit word from src, splits it into 8-bit channels and
 * converts each with ubyte_to_float(). The i/j coordinates are not used.
 */
void
util_format_b8g8r8a8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit position of each channel within the 32-bit word as loaded. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   const uint32_t packed = *(const uint32_t *)src;
   float *rgba = in_dst;

   rgba[0] = ubyte_to_float((packed >> r_shift) & 0xff);
   rgba[1] = ubyte_to_float((packed >> g_shift) & 0xff);
   rgba[2] = ubyte_to_float((packed >> b_shift) & 0xff);
   rgba[3] = ubyte_to_float((packed >> a_shift) & 0xff);
}

/**
 * Unpack a row of B8G8R8A8_UNORM texels into RGBA8 unorm.
 *
 * Each texel is read as one 32-bit word and its four bytes are written out
 * unchanged in r, g, b, a order (a pure channel reorder).
 */
void
util_format_b8g8r8a8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit position of each channel within the 32-bit word as loaded. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   const uint32_t *words = (const uint32_t *)src;
   uint8_t *rgba = dst_row;

   for (unsigned n = 0; n < width; ++n, rgba += 4) {
      const uint32_t packed = words[n];

      rgba[0] = (uint8_t)((packed >> r_shift) & 0xff);
      rgba[1] = (uint8_t)((packed >> g_shift) & 0xff);
      rgba[2] = (uint8_t)((packed >> b_shift) & 0xff);
      rgba[3] = (uint8_t)((packed >> a_shift) & 0xff);
   }
}

/**
 * Pack a 2D block of RGBA8 unorm pixels into B8G8R8A8_UNORM texels.
 *
 * The four source bytes are combined unchanged into one 32-bit word per
 * texel (a pure channel reorder) and stored to the destination.
 *
 * dst_stride and src_stride are per-row advances (the source element is one
 * byte wide, so its byte stride is used directly).
 */
void
util_format_b8g8r8a8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel within the 32-bit word as stored. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif

   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4) {
         const uint32_t r = rgba[0];
         const uint32_t g = rgba[1];
         const uint32_t b = rgba[2];
         const uint32_t a = rgba[3];

         *out++ = (b << b_shift) | (g << g_shift) | (r << r_shift) | (a << a_shift);
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack a row of B8G8R8X8_UNORM texels into RGBA floats.
 *
 * b, g and r are extracted from each 32-bit word and converted with
 * ubyte_to_float(); the X byte is ignored and alpha is written as 1.
 */
void
util_format_b8g8r8x8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit position of each colour channel within the 32-bit word as loaded. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif
   const uint32_t *words = (const uint32_t *)src;
   float *rgba = dst_row;

   for (unsigned n = 0; n < width; ++n, rgba += 4) {
      const uint32_t packed = words[n];

      rgba[0] = ubyte_to_float((packed >> r_shift) & 0xff);
      rgba[1] = ubyte_to_float((packed >> g_shift) & 0xff);
      rgba[2] = ubyte_to_float((packed >> b_shift) & 0xff);
      rgba[3] = 1;
   }
}

/**
 * Pack a 2D block of RGBA floats into B8G8R8X8_UNORM texels.
 *
 * r, g and b go through float_to_ubyte(); source alpha is ignored and the X
 * byte of every stored word is zero.
 *
 * dst_stride is added to the destination pointer per row; src_stride is in
 * bytes and is converted to a float count before advancing the source.
 */
void
util_format_b8g8r8x8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each colour channel within the 32-bit word as stored. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif

   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4) {
         const uint32_t b = (float_to_ubyte(rgba[2])) & 0xff;
         const uint32_t g = (float_to_ubyte(rgba[1])) & 0xff;
         const uint32_t r = (float_to_ubyte(rgba[0])) & 0xff;

         *out++ = (b << b_shift) | (g << g_shift) | (r << r_shift);
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch one B8G8R8X8_UNORM texel as RGBA floats.
 *
 * b, g and r come from the 32-bit word at src via ubyte_to_float(); alpha is
 * written as 1. The i/j coordinates are not used.
 */
void
util_format_b8g8r8x8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit position of each colour channel within the 32-bit word as loaded. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif
   const uint32_t packed = *(const uint32_t *)src;
   float *rgba = in_dst;

   rgba[0] = ubyte_to_float((packed >> r_shift) & 0xff);
   rgba[1] = ubyte_to_float((packed >> g_shift) & 0xff);
   rgba[2] = ubyte_to_float((packed >> b_shift) & 0xff);
   rgba[3] = 1;
}

/**
 * Unpack a row of B8G8R8X8_UNORM texels into RGBA8 unorm.
 *
 * The b, g and r bytes of each 32-bit word are copied out in r, g, b order;
 * the X byte is ignored and alpha is written as 255.
 */
void
util_format_b8g8r8x8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit position of each colour channel within the 32-bit word as loaded. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif
   const uint32_t *words = (const uint32_t *)src;
   uint8_t *rgba = dst_row;

   for (unsigned n = 0; n < width; ++n, rgba += 4) {
      const uint32_t packed = words[n];

      rgba[0] = (uint8_t)((packed >> r_shift) & 0xff);
      rgba[1] = (uint8_t)((packed >> g_shift) & 0xff);
      rgba[2] = (uint8_t)((packed >> b_shift) & 0xff);
      rgba[3] = 255;
   }
}

/**
 * Pack a 2D block of RGBA8 unorm pixels into B8G8R8X8_UNORM texels.
 *
 * r, g and b are copied unchanged into the 32-bit word; source alpha is
 * dropped and the X byte of every stored word is zero.
 *
 * dst_stride and src_stride are per-row advances (the source element is one
 * byte wide, so its byte stride is used directly).
 */
void
util_format_b8g8r8x8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each colour channel within the 32-bit word as stored. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif

   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4) {
         const uint32_t r = rgba[0];
         const uint32_t g = rgba[1];
         const uint32_t b = rgba[2];

         *out++ = (b << b_shift) | (g << g_shift) | (r << r_shift);
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack a row of B8G8R8X8_SNORM texels into RGBA floats.
 *
 * Each colour byte is sign-extended (shift to the top of the word, then
 * arithmetic shift right by 24) and scaled by 1/127. No clamping is applied,
 * so a stored -128 comes out slightly below -1. Alpha is written as 1.
 */
void
util_format_b8g8r8x8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Left shift that moves each channel into the top byte of the word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_lsl = 0, g_lsl = 8, r_lsl = 16;
#else
   const unsigned b_lsl = 24, g_lsl = 16, r_lsl = 8;
#endif
   const uint32_t *words = (const uint32_t *)src;
   float *rgba = dst_row;

   for (unsigned n = 0; n < width; ++n, rgba += 4) {
      const uint32_t packed = words[n];
      const int32_t r = ((int32_t)(packed << r_lsl)) >> 24;
      const int32_t g = ((int32_t)(packed << g_lsl)) >> 24;
      const int32_t b = ((int32_t)(packed << b_lsl)) >> 24;

      rgba[0] = (float)(r * (1.0f/0x7f));
      rgba[1] = (float)(g * (1.0f/0x7f));
      rgba[2] = (float)(b * (1.0f/0x7f));
      rgba[3] = 1;
   }
}

/**
 * Pack a 2D block of RGBA floats into B8G8R8X8_SNORM texels.
 *
 * r, g and b are clamped to [-1, 1], scaled by 127, rounded with
 * util_iround() and stored as signed bytes. Source alpha is ignored and the
 * X byte of every stored word is zero.
 *
 * dst_stride is added to the destination pointer per row; src_stride is in
 * bytes and is converted to a float count before advancing the source.
 */
void
util_format_b8g8r8x8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each colour channel within the 32-bit word as stored. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif

   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4) {
         /* Masking with 0xff keeps only the two's-complement byte. */
         const uint32_t b = (uint32_t)(((int8_t)util_iround(CLAMP(rgba[2], -1.0f, 1.0f) * 0x7f)) & 0xff);
         const uint32_t g = (uint32_t)(((int8_t)util_iround(CLAMP(rgba[1], -1.0f, 1.0f) * 0x7f)) & 0xff);
         const uint32_t r = (uint32_t)(((int8_t)util_iround(CLAMP(rgba[0], -1.0f, 1.0f) * 0x7f)) & 0xff);

         *out++ = (b << b_shift) | (g << g_shift) | (r << r_shift);
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch one B8G8R8X8_SNORM texel as RGBA floats.
 *
 * Each colour byte of the 32-bit word at src is sign-extended and scaled by
 * 1/127; alpha is written as 1. The i/j coordinates are not used.
 */
void
util_format_b8g8r8x8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Left shift that moves each channel into the top byte of the word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_lsl = 0, g_lsl = 8, r_lsl = 16;
#else
   const unsigned b_lsl = 24, g_lsl = 16, r_lsl = 8;
#endif
   const uint32_t packed = *(const uint32_t *)src;
   const int32_t r = ((int32_t)(packed << r_lsl)) >> 24;
   const int32_t g = ((int32_t)(packed << g_lsl)) >> 24;
   const int32_t b = ((int32_t)(packed << b_lsl)) >> 24;
   float *rgba = in_dst;

   rgba[0] = (float)(r * (1.0f/0x7f));
   rgba[1] = (float)(g * (1.0f/0x7f));
   rgba[2] = (float)(b * (1.0f/0x7f));
   rgba[3] = 1;
}

/**
 * Unpack a row of B8G8R8X8_SNORM texels into RGBA8 unorm.
 *
 * Each colour byte is sign-extended, negative values are raised to 0 with
 * MAX2(), and the result is converted by _mesa_snorm_to_unorm(.., 8, 8).
 * Alpha is written as 255.
 */
void
util_format_b8g8r8x8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Left shift that moves each channel into the top byte of the word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_lsl = 0, g_lsl = 8, r_lsl = 16;
#else
   const unsigned b_lsl = 24, g_lsl = 16, r_lsl = 8;
#endif
   const uint32_t *words = (const uint32_t *)src;
   uint8_t *rgba = dst_row;

   for (unsigned n = 0; n < width; ++n, rgba += 4) {
      const uint32_t packed = words[n];
      const int32_t r = ((int32_t)(packed << r_lsl)) >> 24;
      const int32_t g = ((int32_t)(packed << g_lsl)) >> 24;
      const int32_t b = ((int32_t)(packed << b_lsl)) >> 24;

      rgba[0] = _mesa_snorm_to_unorm(MAX2(r, 0), 8, 8);
      rgba[1] = _mesa_snorm_to_unorm(MAX2(g, 0), 8, 8);
      rgba[2] = _mesa_snorm_to_unorm(MAX2(b, 0), 8, 8);
      rgba[3] = 255;
   }
}

/**
 * Pack a 2D block of RGBA8 unorm pixels into B8G8R8X8_SNORM texels.
 *
 * r, g and b are converted with _mesa_unorm_to_snorm(.., 8, 8) and the low
 * byte of each result is placed in the 32-bit word. Source alpha is ignored
 * and the X byte of every stored word is zero.
 *
 * dst_stride and src_stride are per-row advances (the source element is one
 * byte wide, so its byte stride is used directly).
 */
void
util_format_b8g8r8x8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each colour channel within the 32-bit word as stored. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif

   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4) {
         const uint32_t b = (uint32_t)((_mesa_unorm_to_snorm(rgba[2], 8, 8)) & 0xff);
         const uint32_t g = (uint32_t)((_mesa_unorm_to_snorm(rgba[1], 8, 8)) & 0xff);
         const uint32_t r = (uint32_t)((_mesa_unorm_to_snorm(rgba[0], 8, 8)) & 0xff);

         *out++ = (b << b_shift) | (g << g_shift) | (r << r_shift);
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack a row of B8G8R8X8_UINT texels into RGBA unsigned integers.
 *
 * The b, g and r bytes of each 32-bit word are zero-extended and written in
 * r, g, b order; the X byte is ignored and alpha is written as 1.
 */
void
util_format_b8g8r8x8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit position of each colour channel within the 32-bit word as loaded. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif
   const uint32_t *words = (const uint32_t *)src;
   unsigned *rgba = dst_row;

   for (unsigned n = 0; n < width; ++n, rgba += 4) {
      const uint32_t packed = words[n];

      rgba[0] = (unsigned)((packed >> r_shift) & 0xff);
      rgba[1] = (unsigned)((packed >> g_shift) & 0xff);
      rgba[2] = (unsigned)((packed >> b_shift) & 0xff);
      rgba[3] = 1;
   }
}

/**
 * Pack a 2D block of RGBA unsigned integers into B8G8R8X8_UINT texels.
 *
 * r, g and b are clamped to at most 255 and stored as bytes. Source alpha is
 * ignored and the X byte of every stored word is zero.
 *
 * dst_stride is added to the destination pointer per row; src_stride is in
 * bytes and is converted to an element count before advancing the source.
 */
void
util_format_b8g8r8x8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each colour channel within the 32-bit word as stored. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif

   for (unsigned row = 0; row < height; ++row) {
      const unsigned *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4) {
         const uint32_t b = ((uint8_t)MIN2(rgba[2], 255)) & 0xff;
         const uint32_t g = ((uint8_t)MIN2(rgba[1], 255)) & 0xff;
         const uint32_t r = ((uint8_t)MIN2(rgba[0], 255)) & 0xff;

         *out++ = (b << b_shift) | (g << g_shift) | (r << r_shift);
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch one B8G8R8X8_UINT texel as RGBA unsigned integers.
 *
 * The b, g and r bytes of the 32-bit word at src are zero-extended; alpha is
 * written as 1. The i/j coordinates are not used.
 */
void
util_format_b8g8r8x8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit position of each colour channel within the 32-bit word as loaded. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif
   const uint32_t packed = *(const uint32_t *)src;
   unsigned *rgba = in_dst;

   rgba[0] = (unsigned)((packed >> r_shift) & 0xff);
   rgba[1] = (unsigned)((packed >> g_shift) & 0xff);
   rgba[2] = (unsigned)((packed >> b_shift) & 0xff);
   rgba[3] = 1;
}

/**
 * Pack a 2D block of RGBA signed integers into B8G8R8X8_UINT texels.
 *
 * r, g and b are clamped to [0, 255] and stored as bytes. Source alpha is
 * ignored and the X byte of every stored word is zero.
 *
 * dst_stride is added to the destination pointer per row; src_stride is in
 * bytes and is converted to an element count before advancing the source.
 */
void
util_format_b8g8r8x8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each colour channel within the 32-bit word as stored. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif

   for (unsigned row = 0; row < height; ++row) {
      const int *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4) {
         const uint32_t b = ((uint8_t)CLAMP(rgba[2], 0, 255)) & 0xff;
         const uint32_t g = ((uint8_t)CLAMP(rgba[1], 0, 255)) & 0xff;
         const uint32_t r = ((uint8_t)CLAMP(rgba[0], 0, 255)) & 0xff;

         *out++ = (b << b_shift) | (g << g_shift) | (r << r_shift);
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack a row of B8G8R8X8_SINT texels into RGBA signed integers.
 *
 * Each colour byte is sign-extended (shift to the top of the word, then
 * arithmetic shift right by 24) and written in r, g, b order. The X byte is
 * ignored and alpha is written as 1.
 */
void
util_format_b8g8r8x8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Left shift that moves each channel into the top byte of the word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_lsl = 0, g_lsl = 8, r_lsl = 16;
#else
   const unsigned b_lsl = 24, g_lsl = 16, r_lsl = 8;
#endif
   const uint32_t *words = (const uint32_t *)src;
   int *rgba = dst_row;

   for (unsigned n = 0; n < width; ++n, rgba += 4) {
      const uint32_t packed = words[n];
      const int32_t r = ((int32_t)(packed << r_lsl)) >> 24;
      const int32_t g = ((int32_t)(packed << g_lsl)) >> 24;
      const int32_t b = ((int32_t)(packed << b_lsl)) >> 24;

      rgba[0] = (int)r;
      rgba[1] = (int)g;
      rgba[2] = (int)b;
      rgba[3] = 1;
   }
}

/**
 * Pack a 2D block of RGBA signed integers into B8G8R8X8_SINT texels.
 *
 * r, g and b are clamped to [-128, 127] and stored as two's-complement
 * bytes. Source alpha is ignored and the X byte of every stored word is zero.
 *
 * dst_stride is added to the destination pointer per row; src_stride is in
 * bytes and is converted to an element count before advancing the source.
 */
void
util_format_b8g8r8x8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each colour channel within the 32-bit word as stored. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif

   for (unsigned row = 0; row < height; ++row) {
      const int *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4) {
         /* Masking with 0xff keeps only the two's-complement byte. */
         const uint32_t b = (uint32_t)(((int8_t)CLAMP(rgba[2], -128, 127)) & 0xff);
         const uint32_t g = (uint32_t)(((int8_t)CLAMP(rgba[1], -128, 127)) & 0xff);
         const uint32_t r = (uint32_t)(((int8_t)CLAMP(rgba[0], -128, 127)) & 0xff);

         *out++ = (b << b_shift) | (g << g_shift) | (r << r_shift);
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch one B8G8R8X8_SINT texel as RGBA signed integers.
 *
 * Each colour byte of the 32-bit word at src is sign-extended; alpha is
 * written as 1. The i/j coordinates are not used.
 */
void
util_format_b8g8r8x8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Left shift that moves each channel into the top byte of the word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_lsl = 0, g_lsl = 8, r_lsl = 16;
#else
   const unsigned b_lsl = 24, g_lsl = 16, r_lsl = 8;
#endif
   const uint32_t packed = *(const uint32_t *)src;
   const int32_t r = ((int32_t)(packed << r_lsl)) >> 24;
   const int32_t g = ((int32_t)(packed << g_lsl)) >> 24;
   const int32_t b = ((int32_t)(packed << b_lsl)) >> 24;
   int *rgba = in_dst;

   rgba[0] = (int)r;
   rgba[1] = (int)g;
   rgba[2] = (int)b;
   rgba[3] = 1;
}

/**
 * Pack a 2D block of RGBA unsigned integers into B8G8R8X8_SINT texels.
 *
 * r, g and b are clamped to at most 127 (the largest signed byte) and
 * stored as bytes. Source alpha is ignored and the X byte of every stored
 * word is zero.
 *
 * dst_stride is added to the destination pointer per row; src_stride is in
 * bytes and is converted to an element count before advancing the source.
 */
void
util_format_b8g8r8x8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each colour channel within the 32-bit word as stored. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif

   for (unsigned row = 0; row < height; ++row) {
      const unsigned *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4) {
         const uint32_t b = (uint32_t)(((int8_t)MIN2(rgba[2], 127)) & 0xff);
         const uint32_t g = (uint32_t)(((int8_t)MIN2(rgba[1], 127)) & 0xff);
         const uint32_t r = (uint32_t)(((int8_t)MIN2(rgba[0], 127)) & 0xff);

         *out++ = (b << b_shift) | (g << g_shift) | (r << r_shift);
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack a row of A8R8G8B8_UNORM texels into RGBA floats.
 *
 * Every texel is read as one 32-bit word; the four 8-bit channels are
 * extracted and converted with ubyte_to_float(). The output order is always
 * r, g, b, a.
 */
void
util_format_a8r8g8b8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit position of each channel within the 32-bit word as loaded. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   const uint32_t *words = (const uint32_t *)src;
   float *rgba = dst_row;

   for (unsigned n = 0; n < width; ++n, rgba += 4) {
      const uint32_t packed = words[n];

      rgba[0] = ubyte_to_float((packed >> r_shift) & 0xff);
      rgba[1] = ubyte_to_float((packed >> g_shift) & 0xff);
      rgba[2] = ubyte_to_float((packed >> b_shift) & 0xff);
      rgba[3] = ubyte_to_float((packed >> a_shift) & 0xff);
   }
}

/**
 * Pack a 2D block of RGBA floats into A8R8G8B8_UNORM texels.
 *
 * Each component goes through float_to_ubyte() and the four bytes are
 * combined into one 32-bit word that is stored to the destination.
 *
 * dst_stride is added to the destination pointer per row; src_stride is in
 * bytes and is converted to a float count before advancing the source.
 */
void
util_format_a8r8g8b8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel within the 32-bit word as stored. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif

   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4) {
         const uint32_t a = (float_to_ubyte(rgba[3])) & 0xff;
         const uint32_t r = (float_to_ubyte(rgba[0])) & 0xff;
         const uint32_t g = (float_to_ubyte(rgba[1])) & 0xff;
         const uint32_t b = (float_to_ubyte(rgba[2])) & 0xff;

         *out++ = (a << a_shift) | (r << r_shift) | (g << g_shift) | (b << b_shift);
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch one A8R8G8B8_UNORM texel as RGBA floats.
 *
 * Reads a single 32-bit word from src, splits it into 8-bit channels and
 * converts each with ubyte_to_float(). The i/j coordinates are not used.
 */
void
util_format_a8r8g8b8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit position of each channel within the 32-bit word as loaded. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   const uint32_t packed = *(const uint32_t *)src;
   float *rgba = in_dst;

   rgba[0] = ubyte_to_float((packed >> r_shift) & 0xff);
   rgba[1] = ubyte_to_float((packed >> g_shift) & 0xff);
   rgba[2] = ubyte_to_float((packed >> b_shift) & 0xff);
   rgba[3] = ubyte_to_float((packed >> a_shift) & 0xff);
}

/**
 * Unpack a row of A8R8G8B8_UNORM texels into RGBA8 unorm.
 *
 * Each texel is read as one 32-bit word and its four bytes are written out
 * unchanged in r, g, b, a order (a pure channel reorder).
 */
void
util_format_a8r8g8b8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit position of each channel within the 32-bit word as loaded. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   const uint32_t *words = (const uint32_t *)src;
   uint8_t *rgba = dst_row;

   for (unsigned n = 0; n < width; ++n, rgba += 4) {
      const uint32_t packed = words[n];

      rgba[0] = (uint8_t)((packed >> r_shift) & 0xff);
      rgba[1] = (uint8_t)((packed >> g_shift) & 0xff);
      rgba[2] = (uint8_t)((packed >> b_shift) & 0xff);
      rgba[3] = (uint8_t)((packed >> a_shift) & 0xff);
   }
}

/**
 * Pack a 2D block of RGBA8 unorm pixels into A8R8G8B8_UNORM texels.
 *
 * The four source bytes are combined unchanged into one 32-bit word per
 * texel (a pure channel reorder) and stored to the destination.
 *
 * dst_stride and src_stride are per-row advances (the source element is one
 * byte wide, so its byte stride is used directly).
 */
void
util_format_a8r8g8b8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel within the 32-bit word as stored. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif

   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4) {
         const uint32_t r = rgba[0];
         const uint32_t g = rgba[1];
         const uint32_t b = rgba[2];
         const uint32_t a = rgba[3];

         *out++ = (a << a_shift) | (r << r_shift) | (g << g_shift) | (b << b_shift);
      }

      src_row += src_stride/sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack a row of A8R8G8B8 SNORM texels into RGBA floats.
 *
 * A source texel is four signed bytes laid out in memory as A, R, G, B.
 * Each byte is scaled by 1/127 and written as a float in R, G, B, A order.
 * The result is clamped at -1.0f, so the byte 0x80 (-128) decodes to -1.0f
 * rather than -128/127.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param dst_row  destination, receives 4 floats per texel
 * \param src      source, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_a8r8g8b8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   for (unsigned i = 0; i < width; i++, src += 4, out += 4) {
      for (unsigned c = 0; c < 4; c++) {
         /* Output channel c (r, g, b, a) lives at source byte (c + 1) % 4. */
         const int32_t s = (int8_t)src[(c + 1) & 3];
         const float f = (float)(s * (1.0f/0x7f));
         out[c] = f < -1.0f ? -1.0f : f;
      }
   }
}

/**
 * Pack a rectangle of RGBA float texels into A8R8G8B8 SNORM.
 *
 * Each source channel is clamped to [-1, 1], scaled by 127, rounded with
 * util_iround() and narrowed to a signed byte. The bytes are written in
 * memory as A, R, G, B.
 *
 * Bytes are stored individually instead of through a uint32_t pointer, so
 * the destination needs no particular alignment and no endian-specific path.
 *
 * \param dst_row     first destination row
 * \param dst_stride  bytes between consecutive destination rows
 * \param src_row     first source row (4 floats per texel)
 * \param src_stride  bytes between consecutive source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_a8r8g8b8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         const int8_t a = (int8_t)util_iround(CLAMP(in[3], -1.0f, 1.0f) * 0x7f);
         const int8_t r = (int8_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7f);
         const int8_t g = (int8_t)util_iround(CLAMP(in[1], -1.0f, 1.0f) * 0x7f);
         const int8_t b = (int8_t)util_iround(CLAMP(in[2], -1.0f, 1.0f) * 0x7f);
         out[0] = (uint8_t)a;
         out[1] = (uint8_t)r;
         out[2] = (uint8_t)g;
         out[3] = (uint8_t)b;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one A8R8G8B8 SNORM texel as RGBA floats.
 *
 * The four signed source bytes are laid out in memory as A, R, G, B. Each
 * is scaled by 1/127 and written in R, G, B, A order. The result is clamped
 * at -1.0f, so the byte 0x80 (-128) decodes to -1.0f rather than -128/127.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param in_dst  destination, receives 4 floats
 * \param src     the 4-byte texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_a8r8g8b8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   for (unsigned c = 0; c < 4; c++) {
      /* Output channel c (r, g, b, a) lives at source byte (c + 1) % 4. */
      const int32_t s = (int8_t)src[(c + 1) & 3];
      const float f = (float)(s * (1.0f/0x7f));
      out[c] = f < -1.0f ? -1.0f : f;
   }
}

/**
 * Unpack a row of A8R8G8B8 SNORM texels into RGBA 8-bit UNORM.
 *
 * The four signed source bytes are laid out in memory as A, R, G, B.
 * Negative values are raised to 0 before being converted with
 * _mesa_snorm_to_unorm(); output order is R, G, B, A.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param dst_row  destination, receives 4 bytes per texel
 * \param src      source, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_a8r8g8b8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned i = 0; i < width; i++, src += 4, out += 4) {
      const int32_t a = (int8_t)src[0];
      const int32_t r = (int8_t)src[1];
      const int32_t g = (int8_t)src[2];
      const int32_t b = (int8_t)src[3];
      out[0] = _mesa_snorm_to_unorm(MAX2(r, 0), 8, 8);
      out[1] = _mesa_snorm_to_unorm(MAX2(g, 0), 8, 8);
      out[2] = _mesa_snorm_to_unorm(MAX2(b, 0), 8, 8);
      out[3] = _mesa_snorm_to_unorm(MAX2(a, 0), 8, 8);
   }
}

/**
 * Pack a rectangle of RGBA 8-bit UNORM texels into A8R8G8B8 SNORM.
 *
 * Each source byte is converted with _mesa_unorm_to_snorm() and its low
 * 8 bits are stored; destination bytes are written in memory as A, R, G, B.
 *
 * Bytes are stored individually instead of through a uint32_t pointer, so
 * the destination needs no particular alignment and no endian-specific path.
 *
 * \param dst_row     first destination row
 * \param dst_stride  bytes between consecutive destination rows
 * \param src_row     first source row (R, G, B, A bytes per texel)
 * \param src_stride  bytes between consecutive source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_a8r8g8b8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         out[0] = (uint8_t)_mesa_unorm_to_snorm(in[3], 8, 8); /* a */
         out[1] = (uint8_t)_mesa_unorm_to_snorm(in[0], 8, 8); /* r */
         out[2] = (uint8_t)_mesa_unorm_to_snorm(in[1], 8, 8); /* g */
         out[3] = (uint8_t)_mesa_unorm_to_snorm(in[2], 8, 8); /* b */
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of A8R8G8B8 SINT texels into RGBA signed integers.
 *
 * The four source bytes are laid out in memory as A, R, G, B. Each is
 * sign-extended to int and written in R, G, B, A order.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param dst_row  destination, receives 4 ints per texel
 * \param src      source, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_a8r8g8b8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *out = dst_row;
   for (unsigned i = 0; i < width; i++, src += 4, out += 4) {
      out[0] = (int8_t)src[1]; /* r */
      out[1] = (int8_t)src[2]; /* g */
      out[2] = (int8_t)src[3]; /* b */
      out[3] = (int8_t)src[0]; /* a */
   }
}

/**
 * Pack a rectangle of RGBA signed-integer texels into A8R8G8B8 SINT.
 *
 * Each source channel is clamped to [-128, 127] and narrowed to a signed
 * byte; destination bytes are written in memory as A, R, G, B.
 *
 * Bytes are stored individually instead of through a uint32_t pointer, so
 * the destination needs no particular alignment and no endian-specific path.
 *
 * \param dst_row     first destination row
 * \param dst_stride  bytes between consecutive destination rows
 * \param src_row     first source row (4 ints per texel: R, G, B, A)
 * \param src_stride  bytes between consecutive source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_a8r8g8b8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const int *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         out[0] = (uint8_t)(int8_t)CLAMP(in[3], -128, 127); /* a */
         out[1] = (uint8_t)(int8_t)CLAMP(in[0], -128, 127); /* r */
         out[2] = (uint8_t)(int8_t)CLAMP(in[1], -128, 127); /* g */
         out[3] = (uint8_t)(int8_t)CLAMP(in[2], -128, 127); /* b */
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one A8R8G8B8 SINT texel as RGBA signed integers.
 *
 * The four source bytes are laid out in memory as A, R, G, B. Each is
 * sign-extended to int and written in R, G, B, A order.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param in_dst  destination, receives 4 ints
 * \param src     the 4-byte texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_a8r8g8b8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *out = in_dst;
   out[0] = (int8_t)src[1]; /* r */
   out[1] = (int8_t)src[2]; /* g */
   out[2] = (int8_t)src[3]; /* b */
   out[3] = (int8_t)src[0]; /* a */
}

/**
 * Pack a rectangle of RGBA unsigned-integer texels into A8R8G8B8 SINT.
 *
 * Each source channel is limited to at most 127 and stored as one byte;
 * destination bytes are written in memory as A, R, G, B.
 *
 * Bytes are stored individually instead of through a uint32_t pointer, so
 * the destination needs no particular alignment and no endian-specific path.
 *
 * \param dst_row     first destination row
 * \param dst_stride  bytes between consecutive destination rows
 * \param src_row     first source row (4 unsigned per texel: R, G, B, A)
 * \param src_stride  bytes between consecutive source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_a8r8g8b8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const unsigned *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         out[0] = (uint8_t)(int8_t)MIN2(in[3], 127); /* a */
         out[1] = (uint8_t)(int8_t)MIN2(in[0], 127); /* r */
         out[2] = (uint8_t)(int8_t)MIN2(in[1], 127); /* g */
         out[3] = (uint8_t)(int8_t)MIN2(in[2], 127); /* b */
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of X8R8G8B8 UNORM texels into RGBA floats.
 *
 * A source texel is four bytes laid out in memory as X, R, G, B. The X byte
 * is ignored; R, G and B go through ubyte_to_float() and alpha is set to 1.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param dst_row  destination, receives 4 floats per texel
 * \param src      source, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_x8r8g8b8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   for (unsigned i = 0; i < width; i++, src += 4, out += 4) {
      out[0] = ubyte_to_float(src[1]); /* r */
      out[1] = ubyte_to_float(src[2]); /* g */
      out[2] = ubyte_to_float(src[3]); /* b */
      out[3] = 1.0f;                   /* a */
   }
}

/**
 * Pack a rectangle of RGBA float texels into X8R8G8B8 UNORM.
 *
 * R, G and B are converted with float_to_ubyte(); source alpha is ignored.
 * Destination bytes are written in memory as X, R, G, B with X set to 0.
 *
 * Bytes are stored individually instead of through a uint32_t pointer, so
 * the destination needs no particular alignment and no endian-specific path.
 *
 * \param dst_row     first destination row
 * \param dst_stride  bytes between consecutive destination rows
 * \param src_row     first source row (4 floats per texel)
 * \param src_stride  bytes between consecutive source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_x8r8g8b8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         out[0] = 0;                     /* x */
         out[1] = float_to_ubyte(in[0]); /* r */
         out[2] = float_to_ubyte(in[1]); /* g */
         out[3] = float_to_ubyte(in[2]); /* b */
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one X8R8G8B8 UNORM texel as RGBA floats.
 *
 * The source bytes are laid out in memory as X, R, G, B. The X byte is
 * ignored; R, G and B go through ubyte_to_float() and alpha is set to 1.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param in_dst  destination, receives 4 floats
 * \param src     the 4-byte texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_x8r8g8b8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   out[0] = ubyte_to_float(src[1]); /* r */
   out[1] = ubyte_to_float(src[2]); /* g */
   out[2] = ubyte_to_float(src[3]); /* b */
   out[3] = 1.0f;                   /* a */
}

/**
 * Unpack a row of X8R8G8B8 UNORM texels into RGBA 8-bit UNORM.
 *
 * A source texel is four bytes laid out in memory as X, R, G, B. The X byte
 * is ignored; R, G and B are copied and alpha is set to 255.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param dst_row  destination, receives 4 bytes per texel
 * \param src      source, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_x8r8g8b8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned i = 0; i < width; i++, src += 4, out += 4) {
      out[0] = src[1]; /* r */
      out[1] = src[2]; /* g */
      out[2] = src[3]; /* b */
      out[3] = 255;    /* a */
   }
}

/**
 * Pack a rectangle of RGBA 8-bit UNORM texels into X8R8G8B8 UNORM.
 *
 * R, G and B are copied; source alpha is ignored. Destination bytes are
 * written in memory as X, R, G, B with X set to 0.
 *
 * Bytes are stored individually instead of through a uint32_t pointer, so
 * the destination needs no particular alignment and no endian-specific path.
 *
 * \param dst_row     first destination row
 * \param dst_stride  bytes between consecutive destination rows
 * \param src_row     first source row (R, G, B, A bytes per texel)
 * \param src_stride  bytes between consecutive source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_x8r8g8b8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         out[0] = 0;     /* x */
         out[1] = in[0]; /* r */
         out[2] = in[1]; /* g */
         out[3] = in[2]; /* b */
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of X8R8G8B8 SNORM texels into RGBA floats.
 *
 * A source texel is four bytes laid out in memory as X, R, G, B. The X byte
 * is ignored. R, G and B are treated as signed bytes, scaled by 1/127 and
 * clamped at -1.0f, so 0x80 (-128) decodes to -1.0f rather than -128/127.
 * Alpha is set to 1.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param dst_row  destination, receives 4 floats per texel
 * \param src      source, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_x8r8g8b8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   for (unsigned i = 0; i < width; i++, src += 4, out += 4) {
      for (unsigned c = 0; c < 3; c++) {
         /* Output channel c (r, g, b) lives at source byte c + 1. */
         const int32_t s = (int8_t)src[c + 1];
         const float f = (float)(s * (1.0f/0x7f));
         out[c] = f < -1.0f ? -1.0f : f;
      }
      out[3] = 1.0f; /* a */
   }
}

/**
 * Pack a rectangle of RGBA float texels into X8R8G8B8 SNORM.
 *
 * R, G and B are clamped to [-1, 1], scaled by 127, rounded with
 * util_iround() and narrowed to a signed byte; source alpha is ignored.
 * Destination bytes are written in memory as X, R, G, B with X set to 0.
 *
 * Bytes are stored individually instead of through a uint32_t pointer, so
 * the destination needs no particular alignment and no endian-specific path.
 *
 * \param dst_row     first destination row
 * \param dst_stride  bytes between consecutive destination rows
 * \param src_row     first source row (4 floats per texel)
 * \param src_stride  bytes between consecutive source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_x8r8g8b8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         const int8_t r = (int8_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7f);
         const int8_t g = (int8_t)util_iround(CLAMP(in[1], -1.0f, 1.0f) * 0x7f);
         const int8_t b = (int8_t)util_iround(CLAMP(in[2], -1.0f, 1.0f) * 0x7f);
         out[0] = 0; /* x */
         out[1] = (uint8_t)r;
         out[2] = (uint8_t)g;
         out[3] = (uint8_t)b;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one X8R8G8B8 SNORM texel as RGBA floats.
 *
 * The source bytes are laid out in memory as X, R, G, B. The X byte is
 * ignored. R, G and B are treated as signed bytes, scaled by 1/127 and
 * clamped at -1.0f, so 0x80 (-128) decodes to -1.0f rather than -128/127.
 * Alpha is set to 1.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param in_dst  destination, receives 4 floats
 * \param src     the 4-byte texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_x8r8g8b8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   for (unsigned c = 0; c < 3; c++) {
      /* Output channel c (r, g, b) lives at source byte c + 1. */
      const int32_t s = (int8_t)src[c + 1];
      const float f = (float)(s * (1.0f/0x7f));
      out[c] = f < -1.0f ? -1.0f : f;
   }
   out[3] = 1.0f; /* a */
}

/**
 * Unpack a row of X8R8G8B8 SNORM texels into RGBA 8-bit UNORM.
 *
 * The source bytes are laid out in memory as X, R, G, B. The X byte is
 * ignored. R, G and B are treated as signed bytes, raised to 0 if negative
 * and converted with _mesa_snorm_to_unorm(); alpha is set to 255.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param dst_row  destination, receives 4 bytes per texel
 * \param src      source, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_x8r8g8b8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned i = 0; i < width; i++, src += 4, out += 4) {
      const int32_t r = (int8_t)src[1];
      const int32_t g = (int8_t)src[2];
      const int32_t b = (int8_t)src[3];
      out[0] = _mesa_snorm_to_unorm(MAX2(r, 0), 8, 8);
      out[1] = _mesa_snorm_to_unorm(MAX2(g, 0), 8, 8);
      out[2] = _mesa_snorm_to_unorm(MAX2(b, 0), 8, 8);
      out[3] = 255; /* a */
   }
}

/**
 * Pack a rectangle of RGBA 8-bit UNORM texels into X8R8G8B8 SNORM.
 *
 * R, G and B are converted with _mesa_unorm_to_snorm() and the low 8 bits
 * of each result are stored; source alpha is ignored. Destination bytes are
 * written in memory as X, R, G, B with X set to 0.
 *
 * Bytes are stored individually instead of through a uint32_t pointer, so
 * the destination needs no particular alignment and no endian-specific path.
 *
 * \param dst_row     first destination row
 * \param dst_stride  bytes between consecutive destination rows
 * \param src_row     first source row (R, G, B, A bytes per texel)
 * \param src_stride  bytes between consecutive source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_x8r8g8b8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         out[0] = 0;                                          /* x */
         out[1] = (uint8_t)_mesa_unorm_to_snorm(in[0], 8, 8); /* r */
         out[2] = (uint8_t)_mesa_unorm_to_snorm(in[1], 8, 8); /* g */
         out[3] = (uint8_t)_mesa_unorm_to_snorm(in[2], 8, 8); /* b */
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of X8R8G8B8 SINT texels into RGBA signed integers.
 *
 * The source bytes are laid out in memory as X, R, G, B. The X byte is
 * ignored; R, G and B are sign-extended to int and alpha is set to 1.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param dst_row  destination, receives 4 ints per texel
 * \param src      source, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_x8r8g8b8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *out = dst_row;
   for (unsigned i = 0; i < width; i++, src += 4, out += 4) {
      out[0] = (int8_t)src[1]; /* r */
      out[1] = (int8_t)src[2]; /* g */
      out[2] = (int8_t)src[3]; /* b */
      out[3] = 1;              /* a */
   }
}

/**
 * Pack a rectangle of RGBA signed-integer texels into X8R8G8B8 SINT.
 *
 * R, G and B are clamped to [-128, 127] and narrowed to a signed byte;
 * source alpha is ignored. Destination bytes are written in memory as
 * X, R, G, B with X set to 0.
 *
 * Bytes are stored individually instead of through a uint32_t pointer, so
 * the destination needs no particular alignment and no endian-specific path.
 *
 * \param dst_row     first destination row
 * \param dst_stride  bytes between consecutive destination rows
 * \param src_row     first source row (4 ints per texel: R, G, B, A)
 * \param src_stride  bytes between consecutive source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_x8r8g8b8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const int *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         out[0] = 0;                                        /* x */
         out[1] = (uint8_t)(int8_t)CLAMP(in[0], -128, 127); /* r */
         out[2] = (uint8_t)(int8_t)CLAMP(in[1], -128, 127); /* g */
         out[3] = (uint8_t)(int8_t)CLAMP(in[2], -128, 127); /* b */
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one X8R8G8B8 SINT texel as RGBA signed integers.
 *
 * The source bytes are laid out in memory as X, R, G, B. The X byte is
 * ignored; R, G and B are sign-extended to int and alpha is set to 1.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param in_dst  destination, receives 4 ints
 * \param src     the 4-byte texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_x8r8g8b8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *out = in_dst;
   out[0] = (int8_t)src[1]; /* r */
   out[1] = (int8_t)src[2]; /* g */
   out[2] = (int8_t)src[3]; /* b */
   out[3] = 1;              /* a */
}

/**
 * Pack a rectangle of RGBA unsigned-integer texels into X8R8G8B8 SINT.
 *
 * R, G and B are limited to at most 127 and stored as one byte each;
 * source alpha is ignored. Destination bytes are written in memory as
 * X, R, G, B with X set to 0.
 *
 * Bytes are stored individually instead of through a uint32_t pointer, so
 * the destination needs no particular alignment and no endian-specific path.
 *
 * \param dst_row     first destination row
 * \param dst_stride  bytes between consecutive destination rows
 * \param src_row     first source row (4 unsigned per texel: R, G, B, A)
 * \param src_stride  bytes between consecutive source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_x8r8g8b8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const unsigned *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         out[0] = 0;                                 /* x */
         out[1] = (uint8_t)(int8_t)MIN2(in[0], 127); /* r */
         out[2] = (uint8_t)(int8_t)MIN2(in[1], 127); /* g */
         out[3] = (uint8_t)(int8_t)MIN2(in[2], 127); /* b */
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of A8B8G8R8 UNORM texels into RGBA floats.
 *
 * A source texel is four bytes laid out in memory as A, B, G, R. Each byte
 * goes through ubyte_to_float() and is written in R, G, B, A order.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param dst_row  destination, receives 4 floats per texel
 * \param src      source, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_a8b8g8r8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   for (unsigned i = 0; i < width; i++, src += 4, out += 4) {
      out[0] = ubyte_to_float(src[3]); /* r */
      out[1] = ubyte_to_float(src[2]); /* g */
      out[2] = ubyte_to_float(src[1]); /* b */
      out[3] = ubyte_to_float(src[0]); /* a */
   }
}

/**
 * Pack a rectangle of RGBA float texels into A8B8G8R8 UNORM.
 *
 * Each channel is converted with float_to_ubyte(); destination bytes are
 * written in memory as A, B, G, R.
 *
 * Bytes are stored individually instead of through a uint32_t pointer, so
 * the destination needs no particular alignment and no endian-specific path.
 *
 * \param dst_row     first destination row
 * \param dst_stride  bytes between consecutive destination rows
 * \param src_row     first source row (4 floats per texel)
 * \param src_stride  bytes between consecutive source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_a8b8g8r8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         out[0] = float_to_ubyte(in[3]); /* a */
         out[1] = float_to_ubyte(in[2]); /* b */
         out[2] = float_to_ubyte(in[1]); /* g */
         out[3] = float_to_ubyte(in[0]); /* r */
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one A8B8G8R8 UNORM texel as RGBA floats.
 *
 * The source bytes are laid out in memory as A, B, G, R. Each byte goes
 * through ubyte_to_float() and is written in R, G, B, A order.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param in_dst  destination, receives 4 floats
 * \param src     the 4-byte texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_a8b8g8r8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   out[0] = ubyte_to_float(src[3]); /* r */
   out[1] = ubyte_to_float(src[2]); /* g */
   out[2] = ubyte_to_float(src[1]); /* b */
   out[3] = ubyte_to_float(src[0]); /* a */
}

/**
 * Unpack a row of A8B8G8R8 UNORM texels into RGBA 8-bit UNORM.
 *
 * A source texel is four bytes laid out in memory as A, B, G, R; the
 * destination texel is the same four bytes in R, G, B, A order (i.e. the
 * byte order is reversed).
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param dst_row  destination, receives 4 bytes per texel
 * \param src      source, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_a8b8g8r8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned i = 0; i < width; i++, src += 4, out += 4) {
      out[0] = src[3]; /* r */
      out[1] = src[2]; /* g */
      out[2] = src[1]; /* b */
      out[3] = src[0]; /* a */
   }
}

/**
 * Pack a rectangle of RGBA 8-bit UNORM texels into A8B8G8R8 UNORM.
 *
 * Each source texel is four bytes R, G, B, A; each destination texel is
 * written as the four bytes A, B, G, R (i.e. the byte order is reversed).
 *
 * Bytes are stored individually instead of through a uint32_t pointer, so
 * the destination needs no particular alignment and no endian-specific path.
 *
 * \param dst_row     first destination row
 * \param dst_stride  bytes between consecutive destination rows
 * \param src_row     first source row
 * \param src_stride  bytes between consecutive source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_a8b8g8r8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         out[0] = in[3]; /* a */
         out[1] = in[2]; /* b */
         out[2] = in[1]; /* g */
         out[3] = in[0]; /* r */
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of X8B8G8R8 UNORM texels into RGBA floats.
 *
 * A source texel is four bytes laid out in memory as X, B, G, R. The X byte
 * is ignored; R, G and B go through ubyte_to_float() and alpha is set to 1.
 *
 * The texel is read byte by byte instead of through a uint32_t pointer, so
 * \p src needs no particular alignment and no endian-specific path.
 *
 * \param dst_row  destination, receives 4 floats per texel
 * \param src      source, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_x8b8g8r8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   for (unsigned i = 0; i < width; i++, src += 4, out += 4) {
      out[0] = ubyte_to_float(src[3]); /* r */
      out[1] = ubyte_to_float(src[2]); /* g */
      out[2] = ubyte_to_float(src[1]); /* b */
      out[3] = 1.0f;                   /* a */
   }
}

/**
 * Pack rows of float RGBA pixels into 32-bit X8B8G8R8 words.
 *
 * R, G and B go through float_to_ubyte(); source alpha is never read and the
 * X byte of each stored word stays zero.  dst_stride is added to the byte
 * pointer dst_row after each row; src_stride is divided by
 * sizeof(*src_row) before being added to src_row.
 */
void
util_format_x8b8g8r8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 4) {
         const uint32_t r = float_to_ubyte(rgba[0]);
         const uint32_t g = float_to_ubyte(rgba[1]) & 0xff;
         const uint32_t b = float_to_ubyte(rgba[2]) & 0xff;
#if UTIL_ARCH_BIG_ENDIAN
         *(uint32_t *)out = (b << 16) | (g << 8) | (r & 0xff);
#else
         *(uint32_t *)out = (b << 8) | (g << 16) | (r << 24);
#endif
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single X8B8G8R8 pixel as float RGBA.
 *
 * in_dst is used as an array of four floats.  The i and j arguments are not
 * used.  Alpha is always written as 1.
 */
void
util_format_x8b8g8r8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint32_t texel = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
   const uint32_t red = texel & 0xff;
   const uint32_t green = (texel >> 8) & 0xff;
   const uint32_t blue = (texel >> 16) & 0xff;
#else
   const uint32_t red = texel >> 24;
   const uint32_t green = (texel >> 16) & 0xff;
   const uint32_t blue = (texel >> 8) & 0xff;
#endif
   rgba[0] = ubyte_to_float(red);
   rgba[1] = ubyte_to_float(green);
   rgba[2] = ubyte_to_float(blue);
   rgba[3] = 1; /* no alpha in this format */
}

/**
 * Unpack one row of 32-bit X8B8G8R8 pixels into 8-bit RGBA.
 *
 * Four bytes (r, g, b, a) are written per pixel; the X byte of the source
 * is ignored and alpha is always written as 255.
 */
void
util_format_x8b8g8r8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining, src += 4, rgba += 4) {
      const uint32_t texel = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
      const uint32_t red = texel & 0xff;
      const uint32_t green = (texel >> 8) & 0xff;
      const uint32_t blue = (texel >> 16) & 0xff;
#else
      const uint32_t red = texel >> 24;
      const uint32_t green = (texel >> 16) & 0xff;
      const uint32_t blue = (texel >> 8) & 0xff;
#endif
      rgba[0] = red;
      rgba[1] = green;
      rgba[2] = blue;
      rgba[3] = 255; /* no alpha in this format */
   }
}

/**
 * Pack rows of 8-bit RGBA pixels into 32-bit X8B8G8R8 words.
 *
 * Source alpha is never read and the X byte of each stored word stays zero.
 * dst_stride is added to the byte pointer dst_row after each row; src_stride
 * is divided by sizeof(*src_row) before being added to src_row.
 */
void
util_format_x8b8g8r8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 4) {
         const uint32_t r = rgba[0];
         const uint32_t g = rgba[1] & 0xff;
         const uint32_t b = rgba[2] & 0xff;
#if UTIL_ARCH_BIG_ENDIAN
         *(uint32_t *)out = (b << 16) | (g << 8) | (r & 0xff);
#else
         *(uint32_t *)out = (b << 8) | (g << 16) | (r << 24);
#endif
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack one row of 32-bit R8G8B8X8 pixels into float RGBA.
 *
 * dst_row is used as an array of float with four entries per pixel.  The
 * R, G and B bytes are converted with ubyte_to_float(); the X byte is not
 * read and alpha is always written as 1.
 */
void
util_format_r8g8b8x8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining, src += 4, rgba += 4) {
      const uint32_t texel = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
      const uint32_t red = texel >> 24;
      const uint32_t green = (texel >> 16) & 0xff;
      const uint32_t blue = (texel >> 8) & 0xff;
#else
      const uint32_t red = texel & 0xff;
      const uint32_t green = (texel >> 8) & 0xff;
      const uint32_t blue = (texel >> 16) & 0xff;
#endif
      rgba[0] = ubyte_to_float(red);
      rgba[1] = ubyte_to_float(green);
      rgba[2] = ubyte_to_float(blue);
      rgba[3] = 1; /* no alpha in this format */
   }
}

/**
 * Pack rows of float RGBA pixels into 32-bit R8G8B8X8 words.
 *
 * R, G and B go through float_to_ubyte(); source alpha is never read and the
 * X byte of each stored word stays zero.  dst_stride is added to the byte
 * pointer dst_row after each row; src_stride is divided by
 * sizeof(*src_row) before being added to src_row.
 */
void
util_format_r8g8b8x8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 4) {
         const uint32_t r = float_to_ubyte(rgba[0]);
         const uint32_t g = float_to_ubyte(rgba[1]) & 0xff;
         const uint32_t b = float_to_ubyte(rgba[2]) & 0xff;
#if UTIL_ARCH_BIG_ENDIAN
         *(uint32_t *)out = (r << 24) | (g << 16) | (b << 8);
#else
         *(uint32_t *)out = (r & 0xff) | (g << 8) | (b << 16);
#endif
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8X8 pixel as float RGBA.
 *
 * in_dst is used as an array of four floats.  The i and j arguments are not
 * used.  Alpha is always written as 1.
 */
void
util_format_r8g8b8x8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint32_t texel = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
   const uint32_t red = texel >> 24;
   const uint32_t green = (texel >> 16) & 0xff;
   const uint32_t blue = (texel >> 8) & 0xff;
#else
   const uint32_t red = texel & 0xff;
   const uint32_t green = (texel >> 8) & 0xff;
   const uint32_t blue = (texel >> 16) & 0xff;
#endif
   rgba[0] = ubyte_to_float(red);
   rgba[1] = ubyte_to_float(green);
   rgba[2] = ubyte_to_float(blue);
   rgba[3] = 1; /* no alpha in this format */
}

/**
 * Unpack one row of 32-bit R8G8B8X8 pixels into 8-bit RGBA.
 *
 * Four bytes (r, g, b, a) are written per pixel; the X byte of the source
 * is ignored and alpha is always written as 255.
 */
void
util_format_r8g8b8x8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining, src += 4, rgba += 4) {
      const uint32_t texel = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
      const uint32_t red = texel >> 24;
      const uint32_t green = (texel >> 16) & 0xff;
      const uint32_t blue = (texel >> 8) & 0xff;
#else
      const uint32_t red = texel & 0xff;
      const uint32_t green = (texel >> 8) & 0xff;
      const uint32_t blue = (texel >> 16) & 0xff;
#endif
      rgba[0] = red;
      rgba[1] = green;
      rgba[2] = blue;
      rgba[3] = 255; /* no alpha in this format */
   }
}

/**
 * Pack rows of 8-bit RGBA pixels into 32-bit R8G8B8X8 words.
 *
 * Source alpha is never read and the X byte of each stored word stays zero.
 * dst_stride is added to the byte pointer dst_row after each row; src_stride
 * is divided by sizeof(*src_row) before being added to src_row.
 */
void
util_format_r8g8b8x8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 4) {
         const uint32_t r = rgba[0];
         const uint32_t g = rgba[1] & 0xff;
         const uint32_t b = rgba[2] & 0xff;
#if UTIL_ARCH_BIG_ENDIAN
         *(uint32_t *)out = (r << 24) | (g << 16) | (b << 8);
#else
         *(uint32_t *)out = (r & 0xff) | (g << 8) | (b << 16);
#endif
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack one row of 16-bit R5G5B5A1 pixels into float RGBA.
 *
 * dst_row is used as an array of float with four entries per pixel.  Within
 * each uint16_t load R is bits 0-4, G bits 5-9, B bits 10-14 and A bit 15.
 * Those positions do not depend on UTIL_ARCH_BIG_ENDIAN, so a single code
 * path is used.
 */
void
util_format_r5g5b5a1_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const float scale5 = 1.0f/0x1f;
   const float scale1 = 1.0f/0x1;
   float *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining, src += 2, rgba += 4) {
      const uint16_t texel = *(const uint16_t *)src;
      const uint16_t red = texel & 0x1f;
      const uint16_t green = (texel >> 5) & 0x1f;
      const uint16_t blue = (texel >> 10) & 0x1f;
      const uint16_t alpha = texel >> 15;
      rgba[0] = (float)(red * scale5);
      rgba[1] = (float)(green * scale5);
      rgba[2] = (float)(blue * scale5);
      rgba[3] = (float)(alpha * scale1);
   }
}

/**
 * Pack rows of float RGBA pixels into 16-bit R5G5B5A1 words.
 *
 * Each channel is clamped to [0, 1], scaled to its bit width and rounded
 * with util_iround().  R lands in bits 0-4, G in 5-9, B in 10-14 and A in
 * bit 15; the layout does not depend on UTIL_ARCH_BIG_ENDIAN.  dst_stride is
 * added to the byte pointer dst_row after each row; src_stride is divided by
 * sizeof(*src_row) before being added to src_row.
 */
void
util_format_r5g5b5a1_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint16_t r5 = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x1f);
         const uint16_t g5 = (uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x1f);
         const uint16_t b5 = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x1f);
         const uint16_t a1 = (uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0x1);
         const uint32_t packed = (uint32_t)(r5 & 0x1f)
                               | ((uint32_t)(g5 & 0x1f) << 5)
                               | ((uint32_t)(b5 & 0x1f) << 10)
                               | ((uint32_t)a1 << 15);
         *(uint16_t *)out = (uint16_t)packed;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R5G5B5A1 pixel as float RGBA.
 *
 * in_dst is used as an array of four floats.  The i and j arguments are not
 * used.  Channel bit positions (R 0-4, G 5-9, B 10-14, A 15) do not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r5g5b5a1_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const float scale5 = 1.0f/0x1f;
   const float scale1 = 1.0f/0x1;
   float *rgba = in_dst;
   const uint16_t texel = *(const uint16_t *)src;
   const uint16_t red = texel & 0x1f;
   const uint16_t green = (texel >> 5) & 0x1f;
   const uint16_t blue = (texel >> 10) & 0x1f;
   const uint16_t alpha = texel >> 15;
   rgba[0] = (float)(red * scale5);
   rgba[1] = (float)(green * scale5);
   rgba[2] = (float)(blue * scale5);
   rgba[3] = (float)(alpha * scale1);
}

/**
 * Unpack one row of 16-bit R5G5B5A1 pixels into 8-bit RGBA.
 *
 * The 5-bit colour channels and the 1-bit alpha are widened to 8 bits with
 * _mesa_unorm_to_unorm().  Channel bit positions (R 0-4, G 5-9, B 10-14,
 * A 15) do not depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r5g5b5a1_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining, src += 2, rgba += 4) {
      const uint16_t texel = *(const uint16_t *)src;
      const uint16_t red = texel & 0x1f;
      const uint16_t green = (texel >> 5) & 0x1f;
      const uint16_t blue = (texel >> 10) & 0x1f;
      const uint16_t alpha = texel >> 15;
      rgba[0] = _mesa_unorm_to_unorm(red, 5, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 5, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 5, 8);
      rgba[3] = _mesa_unorm_to_unorm(alpha, 1, 8);
   }
}

/**
 * Pack rows of 8-bit RGBA pixels into 16-bit R5G5B5A1 words.
 *
 * Channels are narrowed with _mesa_unorm_to_unorm() (8 -> 5 bits for colour,
 * 8 -> 1 bit for alpha).  R lands in bits 0-4, G in 5-9, B in 10-14 and A in
 * bit 15; the layout does not depend on UTIL_ARCH_BIG_ENDIAN.  dst_stride is
 * added to the byte pointer dst_row after each row; src_stride is divided by
 * sizeof(*src_row) before being added to src_row.
 */
void
util_format_r5g5b5a1_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t r5 = _mesa_unorm_to_unorm(rgba[0], 8, 5) & 0x1f;
         const uint32_t g5 = _mesa_unorm_to_unorm(rgba[1], 8, 5) & 0x1f;
         const uint32_t b5 = _mesa_unorm_to_unorm(rgba[2], 8, 5) & 0x1f;
         const uint32_t a1 = _mesa_unorm_to_unorm(rgba[3], 8, 1);
         *(uint16_t *)out = (uint16_t)(r5 | (g5 << 5) | (b5 << 10) | (a1 << 15));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack one row of 16-bit R5G5B5X1 pixels into float RGBA.
 *
 * dst_row is used as an array of float with four entries per pixel.  R is
 * bits 0-4, G bits 5-9 and B bits 10-14 of each uint16_t load; bit 15 is
 * ignored and alpha is always written as 1.  The bit positions do not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r5g5b5x1_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const float scale5 = 1.0f/0x1f;
   float *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining, src += 2, rgba += 4) {
      const uint16_t texel = *(const uint16_t *)src;
      const uint16_t red = texel & 0x1f;
      const uint16_t green = (texel >> 5) & 0x1f;
      const uint16_t blue = (texel >> 10) & 0x1f;
      rgba[0] = (float)(red * scale5);
      rgba[1] = (float)(green * scale5);
      rgba[2] = (float)(blue * scale5);
      rgba[3] = 1; /* no alpha in this format */
   }
}

/**
 * Pack rows of float RGBA pixels into 16-bit R5G5B5X1 words.
 *
 * R, G and B are clamped to [0, 1], scaled by 0x1f and rounded with
 * util_iround(); source alpha is never read and bit 15 of each stored word
 * stays zero.  The layout (R 0-4, G 5-9, B 10-14) does not depend on
 * UTIL_ARCH_BIG_ENDIAN.  dst_stride is added to the byte pointer dst_row
 * after each row; src_stride is divided by sizeof(*src_row) before being
 * added to src_row.
 */
void
util_format_r5g5b5x1_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint16_t r5 = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x1f);
         const uint16_t g5 = (uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x1f);
         const uint16_t b5 = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x1f);
         const uint32_t packed = (uint32_t)(r5 & 0x1f)
                               | ((uint32_t)(g5 & 0x1f) << 5)
                               | ((uint32_t)(b5 & 0x1f) << 10);
         *(uint16_t *)out = (uint16_t)packed;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R5G5B5X1 pixel as float RGBA.
 *
 * in_dst is used as an array of four floats.  The i and j arguments are not
 * used.  Bit 15 of the source word is ignored and alpha is always written
 * as 1.  Channel bit positions do not depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r5g5b5x1_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const float scale5 = 1.0f/0x1f;
   float *rgba = in_dst;
   const uint16_t texel = *(const uint16_t *)src;
   const uint16_t red = texel & 0x1f;
   const uint16_t green = (texel >> 5) & 0x1f;
   const uint16_t blue = (texel >> 10) & 0x1f;
   rgba[0] = (float)(red * scale5);
   rgba[1] = (float)(green * scale5);
   rgba[2] = (float)(blue * scale5);
   rgba[3] = 1; /* no alpha in this format */
}

/**
 * Unpack one row of 16-bit R5G5B5X1 pixels into 8-bit RGBA.
 *
 * The 5-bit colour channels are widened to 8 bits with
 * _mesa_unorm_to_unorm(); bit 15 is ignored and alpha is always written as
 * 255.  Channel bit positions do not depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r5g5b5x1_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining, src += 2, rgba += 4) {
      const uint16_t texel = *(const uint16_t *)src;
      const uint16_t red = texel & 0x1f;
      const uint16_t green = (texel >> 5) & 0x1f;
      const uint16_t blue = (texel >> 10) & 0x1f;
      rgba[0] = _mesa_unorm_to_unorm(red, 5, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 5, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 5, 8);
      rgba[3] = 255; /* no alpha in this format */
   }
}

/**
 * Pack rows of 8-bit RGBA pixels into 16-bit R5G5B5X1 words.
 *
 * R, G and B are narrowed from 8 to 5 bits with _mesa_unorm_to_unorm();
 * source alpha is never read and bit 15 of each stored word stays zero.
 * The layout (R 0-4, G 5-9, B 10-14) does not depend on
 * UTIL_ARCH_BIG_ENDIAN.  dst_stride is added to the byte pointer dst_row
 * after each row; src_stride is divided by sizeof(*src_row) before being
 * added to src_row.
 */
void
util_format_r5g5b5x1_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t r5 = _mesa_unorm_to_unorm(rgba[0], 8, 5) & 0x1f;
         const uint32_t g5 = _mesa_unorm_to_unorm(rgba[1], 8, 5) & 0x1f;
         const uint32_t b5 = _mesa_unorm_to_unorm(rgba[2], 8, 5) & 0x1f;
         *(uint16_t *)out = (uint16_t)(r5 | (g5 << 5) | (b5 << 10));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack one row of 16-bit B5G5R5X1 pixels into float RGBA.
 *
 * dst_row is used as an array of float with four entries per pixel.  B is
 * bits 0-4, G bits 5-9 and R bits 10-14 of each uint16_t load; bit 15 is
 * ignored and alpha is always written as 1.  The bit positions do not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_b5g5r5x1_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const float scale5 = 1.0f/0x1f;
   float *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining, src += 2, rgba += 4) {
      const uint16_t texel = *(const uint16_t *)src;
      const uint16_t blue = texel & 0x1f;
      const uint16_t green = (texel >> 5) & 0x1f;
      const uint16_t red = (texel >> 10) & 0x1f;
      rgba[0] = (float)(red * scale5);
      rgba[1] = (float)(green * scale5);
      rgba[2] = (float)(blue * scale5);
      rgba[3] = 1; /* no alpha in this format */
   }
}

/**
 * Pack rows of float RGBA pixels into 16-bit B5G5R5X1 words.
 *
 * R, G and B are clamped to [0, 1], scaled by 0x1f and rounded with
 * util_iround(); source alpha is never read and bit 15 of each stored word
 * stays zero.  The layout (B 0-4, G 5-9, R 10-14) does not depend on
 * UTIL_ARCH_BIG_ENDIAN.  dst_stride is added to the byte pointer dst_row
 * after each row; src_stride is divided by sizeof(*src_row) before being
 * added to src_row.
 */
void
util_format_b5g5r5x1_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint16_t r5 = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x1f);
         const uint16_t g5 = (uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x1f);
         const uint16_t b5 = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x1f);
         const uint32_t packed = (uint32_t)(b5 & 0x1f)
                               | ((uint32_t)(g5 & 0x1f) << 5)
                               | ((uint32_t)(r5 & 0x1f) << 10);
         *(uint16_t *)out = (uint16_t)packed;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B5G5R5X1 pixel as float RGBA.
 *
 * in_dst is used as an array of four floats.  The i and j arguments are not
 * used.  Bit 15 of the source word is ignored and alpha is always written
 * as 1.  Channel bit positions do not depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_b5g5r5x1_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const float scale5 = 1.0f/0x1f;
   float *rgba = in_dst;
   const uint16_t texel = *(const uint16_t *)src;
   const uint16_t blue = texel & 0x1f;
   const uint16_t green = (texel >> 5) & 0x1f;
   const uint16_t red = (texel >> 10) & 0x1f;
   rgba[0] = (float)(red * scale5);
   rgba[1] = (float)(green * scale5);
   rgba[2] = (float)(blue * scale5);
   rgba[3] = 1; /* no alpha in this format */
}

/**
 * Unpack one row of 16-bit B5G5R5X1 pixels into 8-bit RGBA.
 *
 * The 5-bit colour channels are widened to 8 bits with
 * _mesa_unorm_to_unorm(); bit 15 is ignored and alpha is always written as
 * 255.  Channel bit positions do not depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_b5g5r5x1_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining, src += 2, rgba += 4) {
      const uint16_t texel = *(const uint16_t *)src;
      const uint16_t blue = texel & 0x1f;
      const uint16_t green = (texel >> 5) & 0x1f;
      const uint16_t red = (texel >> 10) & 0x1f;
      rgba[0] = _mesa_unorm_to_unorm(red, 5, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 5, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 5, 8);
      rgba[3] = 255; /* no alpha in this format */
   }
}

/**
 * Pack rows of 8-bit RGBA pixels into 16-bit B5G5R5X1 words.
 *
 * R, G and B are narrowed from 8 to 5 bits with _mesa_unorm_to_unorm();
 * source alpha is never read and bit 15 of each stored word stays zero.
 * The layout (B 0-4, G 5-9, R 10-14) does not depend on
 * UTIL_ARCH_BIG_ENDIAN.  dst_stride is added to the byte pointer dst_row
 * after each row; src_stride is divided by sizeof(*src_row) before being
 * added to src_row.
 */
void
util_format_b5g5r5x1_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t r5 = _mesa_unorm_to_unorm(rgba[0], 8, 5) & 0x1f;
         const uint32_t g5 = _mesa_unorm_to_unorm(rgba[1], 8, 5) & 0x1f;
         const uint32_t b5 = _mesa_unorm_to_unorm(rgba[2], 8, 5) & 0x1f;
         *(uint16_t *)out = (uint16_t)(b5 | (g5 << 5) | (r5 << 10));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack one row of 16-bit B5G5R5A1 pixels into float RGBA.
 *
 * dst_row is used as an array of float with four entries per pixel.  Within
 * each uint16_t load B is bits 0-4, G bits 5-9, R bits 10-14 and A bit 15.
 * Those positions do not depend on UTIL_ARCH_BIG_ENDIAN, so a single code
 * path is used.
 */
void
util_format_b5g5r5a1_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const float scale5 = 1.0f/0x1f;
   const float scale1 = 1.0f/0x1;
   float *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining, src += 2, rgba += 4) {
      const uint16_t texel = *(const uint16_t *)src;
      const uint16_t blue = texel & 0x1f;
      const uint16_t green = (texel >> 5) & 0x1f;
      const uint16_t red = (texel >> 10) & 0x1f;
      const uint16_t alpha = texel >> 15;
      rgba[0] = (float)(red * scale5);
      rgba[1] = (float)(green * scale5);
      rgba[2] = (float)(blue * scale5);
      rgba[3] = (float)(alpha * scale1);
   }
}

/**
 * Pack rows of float RGBA pixels into 16-bit B5G5R5A1 words.
 *
 * Each channel is clamped to [0, 1], scaled to its bit width and rounded
 * with util_iround().  B lands in bits 0-4, G in 5-9, R in 10-14 and A in
 * bit 15; the layout does not depend on UTIL_ARCH_BIG_ENDIAN.  dst_stride is
 * added to the byte pointer dst_row after each row; src_stride is divided by
 * sizeof(*src_row) before being added to src_row.
 */
void
util_format_b5g5r5a1_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint16_t r5 = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x1f);
         const uint16_t g5 = (uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x1f);
         const uint16_t b5 = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x1f);
         const uint16_t a1 = (uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0x1);
         const uint32_t packed = (uint32_t)(b5 & 0x1f)
                               | ((uint32_t)(g5 & 0x1f) << 5)
                               | ((uint32_t)(r5 & 0x1f) << 10)
                               | ((uint32_t)a1 << 15);
         *(uint16_t *)out = (uint16_t)packed;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B5G5R5A1 pixel as float RGBA.
 *
 * in_dst is used as an array of four floats.  The i and j arguments are not
 * used.  Channel bit positions (B 0-4, G 5-9, R 10-14, A 15) do not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_b5g5r5a1_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const float scale5 = 1.0f/0x1f;
   const float scale1 = 1.0f/0x1;
   float *rgba = in_dst;
   const uint16_t texel = *(const uint16_t *)src;
   const uint16_t blue = texel & 0x1f;
   const uint16_t green = (texel >> 5) & 0x1f;
   const uint16_t red = (texel >> 10) & 0x1f;
   const uint16_t alpha = texel >> 15;
   rgba[0] = (float)(red * scale5);
   rgba[1] = (float)(green * scale5);
   rgba[2] = (float)(blue * scale5);
   rgba[3] = (float)(alpha * scale1);
}

/**
 * Unpack one row of 16-bit B5G5R5A1 pixels into 8-bit RGBA.
 *
 * The 5-bit colour channels and the 1-bit alpha are widened to 8 bits with
 * _mesa_unorm_to_unorm().  Channel bit positions (B 0-4, G 5-9, R 10-14,
 * A 15) do not depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_b5g5r5a1_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining, src += 2, rgba += 4) {
      const uint16_t texel = *(const uint16_t *)src;
      const uint16_t blue = texel & 0x1f;
      const uint16_t green = (texel >> 5) & 0x1f;
      const uint16_t red = (texel >> 10) & 0x1f;
      const uint16_t alpha = texel >> 15;
      rgba[0] = _mesa_unorm_to_unorm(red, 5, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 5, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 5, 8);
      rgba[3] = _mesa_unorm_to_unorm(alpha, 1, 8);
   }
}

/**
 * Pack rows of 8-bit RGBA pixels into 16-bit B5G5R5A1 words.
 *
 * Channels are narrowed with _mesa_unorm_to_unorm() (8 -> 5 bits for colour,
 * 8 -> 1 bit for alpha).  B lands in bits 0-4, G in 5-9, R in 10-14 and A in
 * bit 15; the layout does not depend on UTIL_ARCH_BIG_ENDIAN.  dst_stride is
 * added to the byte pointer dst_row after each row; src_stride is divided by
 * sizeof(*src_row) before being added to src_row.
 */
void
util_format_b5g5r5a1_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t r5 = _mesa_unorm_to_unorm(rgba[0], 8, 5) & 0x1f;
         const uint32_t g5 = _mesa_unorm_to_unorm(rgba[1], 8, 5) & 0x1f;
         const uint32_t b5 = _mesa_unorm_to_unorm(rgba[2], 8, 5) & 0x1f;
         const uint32_t a1 = _mesa_unorm_to_unorm(rgba[3], 8, 1);
         *(uint16_t *)out = (uint16_t)(b5 | (g5 << 5) | (r5 << 10) | (a1 << 15));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack one row of 16-bit X1B5G5R5 pixels into float RGBA.
 *
 * dst_row is used as an array of float with four entries per pixel.  B is
 * bits 1-5, G bits 6-10 and R bits 11-15 of each uint16_t load; bit 0 is
 * ignored and alpha is always written as 1.  The bit positions do not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_x1b5g5r5_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const float scale5 = 1.0f/0x1f;
   float *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining, src += 2, rgba += 4) {
      const uint16_t texel = *(const uint16_t *)src;
      const uint16_t blue = (texel >> 1) & 0x1f;
      const uint16_t green = (texel >> 6) & 0x1f;
      const uint16_t red = texel >> 11;
      rgba[0] = (float)(red * scale5);
      rgba[1] = (float)(green * scale5);
      rgba[2] = (float)(blue * scale5);
      rgba[3] = 1; /* no alpha in this format */
   }
}

/**
 * Pack rows of float RGBA pixels into 16-bit X1B5G5R5 words.
 *
 * R, G and B are clamped to [0, 1], scaled by 0x1f and rounded with
 * util_iround(); source alpha is never read and bit 0 of each stored word
 * stays zero.  The layout (B 1-5, G 6-10, R 11-15) does not depend on
 * UTIL_ARCH_BIG_ENDIAN.  dst_stride is added to the byte pointer dst_row
 * after each row; src_stride is divided by sizeof(*src_row) before being
 * added to src_row.
 */
void
util_format_x1b5g5r5_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint16_t r5 = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x1f);
         const uint16_t g5 = (uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x1f);
         const uint16_t b5 = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x1f);
         /* The top channel is shifted without a mask, as in the other
          * formats' topmost field; the store truncates to 16 bits. */
         const uint32_t packed = ((uint32_t)(b5 & 0x1f) << 1)
                               | ((uint32_t)(g5 & 0x1f) << 6)
                               | ((uint32_t)r5 << 11);
         *(uint16_t *)out = (uint16_t)packed;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single X1B5G5R5_UNORM pixel as four RGBA floats.
 *
 * src points at the pixel itself; the i and j arguments are unused.  Red is
 * read from bits 15..11, green from bits 10..6, blue from bits 5..1, each
 * multiplied by 1/31.  Alpha is written as 1.
 */
void
util_format_x1b5g5r5_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t red = pixel >> 11;
   const uint16_t green = (pixel >> 6) & 0x1f;
   const uint16_t blue = (pixel >> 1) & 0x1f;

   rgba[0] = (float)(red * (1.0f/0x1f));
   rgba[1] = (float)(green * (1.0f/0x1f));
   rgba[2] = (float)(blue * (1.0f/0x1f));
   rgba[3] = 1; /* no alpha bits are read: always opaque */
}

/**
 * Unpack a run of X1B5G5R5_UNORM pixels into 8-bit RGBA.
 *
 * Red is read from bits 15..11, green from bits 10..6 and blue from bits
 * 5..1 of each host-order 16-bit pixel.  The 5-bit values are widened to
 * 8 bits with _mesa_unorm_to_unorm() and alpha is written as 255.
 *
 * dst_row receives four bytes per pixel; width is the pixel count.
 */
void
util_format_x1b5g5r5_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;

   for (unsigned i = 0; i < width; ++i, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t red = pixel >> 11;
      const uint16_t green = (pixel >> 6) & 0x1f;
      const uint16_t blue = (pixel >> 1) & 0x1f;

      rgba[0] = _mesa_unorm_to_unorm(red, 5, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 5, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 5, 8);
      rgba[3] = 255; /* no alpha bits are read: always opaque */
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into X1B5G5R5_UNORM.
 *
 * Red, green and blue are narrowed from 8 to 5 bits with
 * _mesa_unorm_to_unorm() and stored at bits 11, 6 and 1 respectively.
 * Bit 0 is left clear and the source alpha byte is not read.
 *
 * dst_row advances by dst_stride bytes per row and src_row by src_stride
 * bytes per row.
 */
void
util_format_x1b5g5r5_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t red = _mesa_unorm_to_unorm(rgba[0], 8, 5);
         const uint32_t green = (_mesa_unorm_to_unorm(rgba[1], 8, 5)) & 0x1f;
         const uint32_t blue = (_mesa_unorm_to_unorm(rgba[2], 8, 5)) & 0x1f;

         *(uint16_t *)out = (uint16_t)((red << 11) | (green << 6) | (blue << 1));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a run of A1R5G5B5_UNORM pixels into RGBA floats.
 *
 * Every pixel is loaded as a host-order 16-bit word.  Alpha is bit 0, red
 * occupies bits 5..1, green bits 10..6 and blue bits 15..11.  The 5-bit
 * channels are multiplied by 1/31; the 1-bit alpha becomes 0 or 1.
 *
 * dst_row receives four floats per pixel, src supplies two bytes per pixel
 * and width is the number of pixels to convert.
 */
void
util_format_a1r5g5b5_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;

   for (unsigned i = 0; i < width; ++i, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t alpha = pixel & 0x1;
      const uint16_t red = (pixel >> 1) & 0x1f;
      const uint16_t green = (pixel >> 6) & 0x1f;
      const uint16_t blue = pixel >> 11;

      rgba[0] = (float)(red * (1.0f/0x1f));
      rgba[1] = (float)(green * (1.0f/0x1f));
      rgba[2] = (float)(blue * (1.0f/0x1f));
      rgba[3] = (float)(alpha * (1.0f/0x1));
   }
}

/**
 * Pack a rectangle of RGBA float pixels into A1R5G5B5_UNORM.
 *
 * Each input channel is clamped to [0, 1], multiplied by its maximum code
 * (31 for colour, 1 for alpha) and passed through util_iround().  Alpha is
 * stored in bit 0, red at bit 1, green at bit 6 and blue at bit 11.
 *
 * dst_row advances by dst_stride bytes per row; src_row advances by
 * src_stride bytes per row (the stride is divided by sizeof(float)).
 */
void
util_format_a1r5g5b5_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t alpha = ((uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0x1)) & 0x1;
         const uint32_t red = ((uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t green = ((uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t blue = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x1f);

         *(uint16_t *)out = (uint16_t)(alpha | (red << 1) | (green << 6) | (blue << 11));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single A1R5G5B5_UNORM pixel as four RGBA floats.
 *
 * src points at the pixel itself; the i and j arguments are unused.  Alpha
 * is bit 0, red bits 5..1, green bits 10..6 and blue bits 15..11.  Colour
 * channels are multiplied by 1/31; alpha becomes 0 or 1.
 */
void
util_format_a1r5g5b5_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t alpha = pixel & 0x1;
   const uint16_t red = (pixel >> 1) & 0x1f;
   const uint16_t green = (pixel >> 6) & 0x1f;
   const uint16_t blue = pixel >> 11;

   rgba[0] = (float)(red * (1.0f/0x1f));
   rgba[1] = (float)(green * (1.0f/0x1f));
   rgba[2] = (float)(blue * (1.0f/0x1f));
   rgba[3] = (float)(alpha * (1.0f/0x1));
}

/**
 * Unpack a run of A1R5G5B5_UNORM pixels into 8-bit RGBA.
 *
 * Alpha is bit 0, red bits 5..1, green bits 10..6 and blue bits 15..11 of
 * each host-order 16-bit pixel.  Every channel is widened to 8 bits with
 * _mesa_unorm_to_unorm().
 *
 * dst_row receives four bytes per pixel; width is the pixel count.
 */
void
util_format_a1r5g5b5_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;

   for (unsigned i = 0; i < width; ++i, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t alpha = pixel & 0x1;
      const uint16_t red = (pixel >> 1) & 0x1f;
      const uint16_t green = (pixel >> 6) & 0x1f;
      const uint16_t blue = pixel >> 11;

      rgba[0] = _mesa_unorm_to_unorm(red, 5, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 5, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 5, 8);
      rgba[3] = _mesa_unorm_to_unorm(alpha, 1, 8);
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into A1R5G5B5_UNORM.
 *
 * Colour channels are narrowed from 8 to 5 bits and alpha from 8 bits to 1
 * bit with _mesa_unorm_to_unorm().  Alpha is stored in bit 0, red at bit 1,
 * green at bit 6 and blue at bit 11.
 *
 * dst_row advances by dst_stride bytes per row and src_row by src_stride
 * bytes per row.
 */
void
util_format_a1r5g5b5_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t alpha = (_mesa_unorm_to_unorm(rgba[3], 8, 1)) & 0x1;
         const uint32_t red = (_mesa_unorm_to_unorm(rgba[0], 8, 5)) & 0x1f;
         const uint32_t green = (_mesa_unorm_to_unorm(rgba[1], 8, 5)) & 0x1f;
         const uint32_t blue = _mesa_unorm_to_unorm(rgba[2], 8, 5);

         *(uint16_t *)out = (uint16_t)(alpha | (red << 1) | (green << 6) | (blue << 11));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a run of X1R5G5B5_UNORM pixels into RGBA floats.
 *
 * Every pixel is loaded as a host-order 16-bit word.  Red comes from bits
 * 5..1, green from bits 10..6 and blue from bits 15..11; bit 0 is never
 * read.  Each 5-bit channel is multiplied by 1/31 and alpha is written as 1.
 *
 * dst_row receives four floats per pixel, src supplies two bytes per pixel
 * and width is the number of pixels to convert.
 */
void
util_format_x1r5g5b5_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;

   for (unsigned i = 0; i < width; ++i, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t red = (pixel >> 1) & 0x1f;
      const uint16_t green = (pixel >> 6) & 0x1f;
      const uint16_t blue = pixel >> 11;

      rgba[0] = (float)(red * (1.0f/0x1f));
      rgba[1] = (float)(green * (1.0f/0x1f));
      rgba[2] = (float)(blue * (1.0f/0x1f));
      rgba[3] = 1; /* no alpha bits are read: always opaque */
   }
}

/**
 * Pack a rectangle of RGBA float pixels into X1R5G5B5_UNORM.
 *
 * For each pixel the red, green and blue inputs are clamped to [0, 1],
 * multiplied by 31 and passed through util_iround().  Red is stored at bit
 * 1, green at bit 6 and blue at bit 11; bit 0 is left clear and the source
 * alpha is not read.
 *
 * dst_row advances by dst_stride bytes per row; src_row advances by
 * src_stride bytes per row (the stride is divided by sizeof(float)).
 */
void
util_format_x1r5g5b5_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t red = ((uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t green = ((uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t blue = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x1f);

         *(uint16_t *)out = (uint16_t)((red << 1) | (green << 6) | (blue << 11));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single X1R5G5B5_UNORM pixel as four RGBA floats.
 *
 * src points at the pixel itself; the i and j arguments are unused.  Red is
 * read from bits 5..1, green from bits 10..6, blue from bits 15..11, each
 * multiplied by 1/31.  Alpha is written as 1.
 */
void
util_format_x1r5g5b5_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t red = (pixel >> 1) & 0x1f;
   const uint16_t green = (pixel >> 6) & 0x1f;
   const uint16_t blue = pixel >> 11;

   rgba[0] = (float)(red * (1.0f/0x1f));
   rgba[1] = (float)(green * (1.0f/0x1f));
   rgba[2] = (float)(blue * (1.0f/0x1f));
   rgba[3] = 1; /* no alpha bits are read: always opaque */
}

/**
 * Unpack a run of X1R5G5B5_UNORM pixels into 8-bit RGBA.
 *
 * Red is read from bits 5..1, green from bits 10..6 and blue from bits
 * 15..11 of each host-order 16-bit pixel.  The 5-bit values are widened to
 * 8 bits with _mesa_unorm_to_unorm() and alpha is written as 255.
 *
 * dst_row receives four bytes per pixel; width is the pixel count.
 */
void
util_format_x1r5g5b5_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;

   for (unsigned i = 0; i < width; ++i, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t red = (pixel >> 1) & 0x1f;
      const uint16_t green = (pixel >> 6) & 0x1f;
      const uint16_t blue = pixel >> 11;

      rgba[0] = _mesa_unorm_to_unorm(red, 5, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 5, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 5, 8);
      rgba[3] = 255; /* no alpha bits are read: always opaque */
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into X1R5G5B5_UNORM.
 *
 * Red, green and blue are narrowed from 8 to 5 bits with
 * _mesa_unorm_to_unorm() and stored at bits 1, 6 and 11 respectively.
 * Bit 0 is left clear and the source alpha byte is not read.
 *
 * dst_row advances by dst_stride bytes per row and src_row by src_stride
 * bytes per row.
 */
void
util_format_x1r5g5b5_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t red = (_mesa_unorm_to_unorm(rgba[0], 8, 5)) & 0x1f;
         const uint32_t green = (_mesa_unorm_to_unorm(rgba[1], 8, 5)) & 0x1f;
         const uint32_t blue = _mesa_unorm_to_unorm(rgba[2], 8, 5);

         *(uint16_t *)out = (uint16_t)((red << 1) | (green << 6) | (blue << 11));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a run of A1B5G5R5_UNORM pixels into RGBA floats.
 *
 * Every pixel is loaded as a host-order 16-bit word.  Alpha is bit 0, blue
 * occupies bits 5..1, green bits 10..6 and red bits 15..11.  The 5-bit
 * channels are multiplied by 1/31; the 1-bit alpha becomes 0 or 1.
 *
 * dst_row receives four floats per pixel, src supplies two bytes per pixel
 * and width is the number of pixels to convert.
 */
void
util_format_a1b5g5r5_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;

   for (unsigned i = 0; i < width; ++i, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t alpha = pixel & 0x1;
      const uint16_t blue = (pixel >> 1) & 0x1f;
      const uint16_t green = (pixel >> 6) & 0x1f;
      const uint16_t red = pixel >> 11;

      rgba[0] = (float)(red * (1.0f/0x1f));
      rgba[1] = (float)(green * (1.0f/0x1f));
      rgba[2] = (float)(blue * (1.0f/0x1f));
      rgba[3] = (float)(alpha * (1.0f/0x1));
   }
}

/**
 * Pack a rectangle of RGBA float pixels into A1B5G5R5_UNORM.
 *
 * Each input channel is clamped to [0, 1], multiplied by its maximum code
 * (31 for colour, 1 for alpha) and passed through util_iround().  Alpha is
 * stored in bit 0, blue at bit 1, green at bit 6 and red at bit 11.
 *
 * dst_row advances by dst_stride bytes per row; src_row advances by
 * src_stride bytes per row (the stride is divided by sizeof(float)).
 */
void
util_format_a1b5g5r5_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t alpha = ((uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0x1)) & 0x1;
         const uint32_t blue = ((uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t green = ((uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t red = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x1f);

         *(uint16_t *)out = (uint16_t)(alpha | (blue << 1) | (green << 6) | (red << 11));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single A1B5G5R5_UNORM pixel as four RGBA floats.
 *
 * src points at the pixel itself; the i and j arguments are unused.  Alpha
 * is bit 0, blue bits 5..1, green bits 10..6 and red bits 15..11.  Colour
 * channels are multiplied by 1/31; alpha becomes 0 or 1.
 */
void
util_format_a1b5g5r5_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t alpha = pixel & 0x1;
   const uint16_t blue = (pixel >> 1) & 0x1f;
   const uint16_t green = (pixel >> 6) & 0x1f;
   const uint16_t red = pixel >> 11;

   rgba[0] = (float)(red * (1.0f/0x1f));
   rgba[1] = (float)(green * (1.0f/0x1f));
   rgba[2] = (float)(blue * (1.0f/0x1f));
   rgba[3] = (float)(alpha * (1.0f/0x1));
}

/**
 * Unpack a run of A1B5G5R5_UNORM pixels into 8-bit RGBA.
 *
 * Alpha is bit 0, blue bits 5..1, green bits 10..6 and red bits 15..11 of
 * each host-order 16-bit pixel.  Every channel is widened to 8 bits with
 * _mesa_unorm_to_unorm().
 *
 * dst_row receives four bytes per pixel; width is the pixel count.
 */
void
util_format_a1b5g5r5_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;

   for (unsigned i = 0; i < width; ++i, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t alpha = pixel & 0x1;
      const uint16_t blue = (pixel >> 1) & 0x1f;
      const uint16_t green = (pixel >> 6) & 0x1f;
      const uint16_t red = pixel >> 11;

      rgba[0] = _mesa_unorm_to_unorm(red, 5, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 5, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 5, 8);
      rgba[3] = _mesa_unorm_to_unorm(alpha, 1, 8);
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into A1B5G5R5_UNORM.
 *
 * Colour channels are narrowed from 8 to 5 bits and alpha from 8 bits to 1
 * bit with _mesa_unorm_to_unorm().  Alpha is stored in bit 0, blue at bit 1,
 * green at bit 6 and red at bit 11.
 *
 * dst_row advances by dst_stride bytes per row and src_row by src_stride
 * bytes per row.
 */
void
util_format_a1b5g5r5_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t alpha = (_mesa_unorm_to_unorm(rgba[3], 8, 1)) & 0x1;
         const uint32_t blue = (_mesa_unorm_to_unorm(rgba[2], 8, 5)) & 0x1f;
         const uint32_t green = (_mesa_unorm_to_unorm(rgba[1], 8, 5)) & 0x1f;
         const uint32_t red = _mesa_unorm_to_unorm(rgba[0], 8, 5);

         *(uint16_t *)out = (uint16_t)(alpha | (blue << 1) | (green << 6) | (red << 11));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a run of R4G4B4A4_UNORM pixels into RGBA floats.
 *
 * Every pixel is loaded as a host-order 16-bit word.  Red is the low nibble
 * (bits 3..0), followed by green (bits 7..4), blue (bits 11..8) and alpha
 * (bits 15..12).  Each 4-bit channel is multiplied by 1/15.
 *
 * dst_row receives four floats per pixel, src supplies two bytes per pixel
 * and width is the number of pixels to convert.
 */
void
util_format_r4g4b4a4_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;

   for (unsigned i = 0; i < width; ++i, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t red = pixel & 0xf;
      const uint16_t green = (pixel >> 4) & 0xf;
      const uint16_t blue = (pixel >> 8) & 0xf;
      const uint16_t alpha = pixel >> 12;

      rgba[0] = (float)(red * (1.0f/0xf));
      rgba[1] = (float)(green * (1.0f/0xf));
      rgba[2] = (float)(blue * (1.0f/0xf));
      rgba[3] = (float)(alpha * (1.0f/0xf));
   }
}

/**
 * Pack a rectangle of RGBA float pixels into R4G4B4A4_UNORM.
 *
 * Each input channel is clamped to [0, 1], multiplied by 15 and passed
 * through util_iround().  Red is stored in bits 3..0, green at bit 4, blue
 * at bit 8 and alpha at bit 12.
 *
 * dst_row advances by dst_stride bytes per row; src_row advances by
 * src_stride bytes per row (the stride is divided by sizeof(float)).
 */
void
util_format_r4g4b4a4_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t red = ((uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t green = ((uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t blue = ((uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t alpha = (uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0xf);

         *(uint16_t *)out = (uint16_t)(red | (green << 4) | (blue << 8) | (alpha << 12));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single R4G4B4A4_UNORM pixel as four RGBA floats.
 *
 * src points at the pixel itself; the i and j arguments are unused.  Red is
 * bits 3..0, green bits 7..4, blue bits 11..8 and alpha bits 15..12; each
 * 4-bit value is multiplied by 1/15.
 */
void
util_format_r4g4b4a4_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t red = pixel & 0xf;
   const uint16_t green = (pixel >> 4) & 0xf;
   const uint16_t blue = (pixel >> 8) & 0xf;
   const uint16_t alpha = pixel >> 12;

   rgba[0] = (float)(red * (1.0f/0xf));
   rgba[1] = (float)(green * (1.0f/0xf));
   rgba[2] = (float)(blue * (1.0f/0xf));
   rgba[3] = (float)(alpha * (1.0f/0xf));
}

/**
 * Unpack a run of R4G4B4A4_UNORM pixels into 8-bit RGBA.
 *
 * Red is bits 3..0, green bits 7..4, blue bits 11..8 and alpha bits 15..12
 * of each host-order 16-bit pixel.  Every 4-bit channel is widened to 8
 * bits with _mesa_unorm_to_unorm().
 *
 * dst_row receives four bytes per pixel; width is the pixel count.
 */
void
util_format_r4g4b4a4_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;

   for (unsigned i = 0; i < width; ++i, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t red = pixel & 0xf;
      const uint16_t green = (pixel >> 4) & 0xf;
      const uint16_t blue = (pixel >> 8) & 0xf;
      const uint16_t alpha = pixel >> 12;

      rgba[0] = _mesa_unorm_to_unorm(red, 4, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 4, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 4, 8);
      rgba[3] = _mesa_unorm_to_unorm(alpha, 4, 8);
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into R4G4B4A4_UNORM.
 *
 * Every channel is narrowed from 8 to 4 bits with _mesa_unorm_to_unorm().
 * Red is stored in bits 3..0, green at bit 4, blue at bit 8 and alpha at
 * bit 12.
 *
 * dst_row advances by dst_stride bytes per row and src_row by src_stride
 * bytes per row.
 */
void
util_format_r4g4b4a4_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t red = (_mesa_unorm_to_unorm(rgba[0], 8, 4)) & 0xf;
         const uint32_t green = (_mesa_unorm_to_unorm(rgba[1], 8, 4)) & 0xf;
         const uint32_t blue = (_mesa_unorm_to_unorm(rgba[2], 8, 4)) & 0xf;
         const uint32_t alpha = _mesa_unorm_to_unorm(rgba[3], 8, 4);

         *(uint16_t *)out = (uint16_t)(red | (green << 4) | (blue << 8) | (alpha << 12));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a run of R4G4B4X4_UNORM pixels into RGBA floats.
 *
 * Every pixel is loaded as a host-order 16-bit word.  Red is bits 3..0,
 * green bits 7..4 and blue bits 11..8; the top nibble is never read.  Each
 * 4-bit channel is multiplied by 1/15 and alpha is written as 1.
 *
 * dst_row receives four floats per pixel, src supplies two bytes per pixel
 * and width is the number of pixels to convert.
 */
void
util_format_r4g4b4x4_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;

   for (unsigned i = 0; i < width; ++i, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t red = pixel & 0xf;
      const uint16_t green = (pixel >> 4) & 0xf;
      const uint16_t blue = (pixel >> 8) & 0xf;

      rgba[0] = (float)(red * (1.0f/0xf));
      rgba[1] = (float)(green * (1.0f/0xf));
      rgba[2] = (float)(blue * (1.0f/0xf));
      rgba[3] = 1; /* top nibble is not read: always opaque */
   }
}

/**
 * Pack a rectangle of RGBA float pixels into R4G4B4X4_UNORM.
 *
 * For each pixel the red, green and blue inputs are clamped to [0, 1],
 * multiplied by 15 and passed through util_iround().  Red is stored in bits
 * 3..0, green at bit 4 and blue at bit 8; the top nibble is left clear and
 * the source alpha is not read.
 *
 * dst_row advances by dst_stride bytes per row; src_row advances by
 * src_stride bytes per row (the stride is divided by sizeof(float)).
 */
void
util_format_r4g4b4x4_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t red = ((uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t green = ((uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t blue = ((uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0xf)) & 0xf;

         *(uint16_t *)out = (uint16_t)(red | (green << 4) | (blue << 8));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single R4G4B4X4_UNORM pixel as four RGBA floats.
 *
 * src points at the pixel itself; the i and j arguments are unused.  Red is
 * bits 3..0, green bits 7..4 and blue bits 11..8, each multiplied by 1/15.
 * The top nibble is not read and alpha is written as 1.
 */
void
util_format_r4g4b4x4_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t red = pixel & 0xf;
   const uint16_t green = (pixel >> 4) & 0xf;
   const uint16_t blue = (pixel >> 8) & 0xf;

   rgba[0] = (float)(red * (1.0f/0xf));
   rgba[1] = (float)(green * (1.0f/0xf));
   rgba[2] = (float)(blue * (1.0f/0xf));
   rgba[3] = 1; /* top nibble is not read: always opaque */
}

/**
 * Unpack a run of R4G4B4X4_UNORM pixels into 8-bit RGBA.
 *
 * Red is bits 3..0, green bits 7..4 and blue bits 11..8 of each host-order
 * 16-bit pixel; the top nibble is never read.  The 4-bit values are widened
 * to 8 bits with _mesa_unorm_to_unorm() and alpha is written as 255.
 *
 * dst_row receives four bytes per pixel; width is the pixel count.
 */
void
util_format_r4g4b4x4_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;

   for (unsigned i = 0; i < width; ++i, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t red = pixel & 0xf;
      const uint16_t green = (pixel >> 4) & 0xf;
      const uint16_t blue = (pixel >> 8) & 0xf;

      rgba[0] = _mesa_unorm_to_unorm(red, 4, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 4, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 4, 8);
      rgba[3] = 255; /* top nibble is not read: always opaque */
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into R4G4B4X4_UNORM.
 *
 * Red, green and blue are narrowed from 8 to 4 bits with
 * _mesa_unorm_to_unorm() and stored at bits 0, 4 and 8 respectively.  The
 * top nibble is left clear and the source alpha byte is not read.
 *
 * dst_row advances by dst_stride bytes per row and src_row by src_stride
 * bytes per row.
 */
void
util_format_r4g4b4x4_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, rgba += 4, out += 2) {
         const uint32_t red = (_mesa_unorm_to_unorm(rgba[0], 8, 4)) & 0xf;
         const uint32_t green = (_mesa_unorm_to_unorm(rgba[1], 8, 4)) & 0xf;
         const uint32_t blue = (_mesa_unorm_to_unorm(rgba[2], 8, 4)) & 0xf;

         *(uint16_t *)out = (uint16_t)(red | (green << 4) | (blue << 8));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a run of B4G4R4A4_UNORM pixels into RGBA floats.
 *
 * Every pixel is loaded as a host-order 16-bit word.  Blue is the low
 * nibble (bits 3..0), followed by green (bits 7..4), red (bits 11..8) and
 * alpha (bits 15..12).  Each 4-bit channel is multiplied by 1/15.
 *
 * dst_row receives four floats per pixel, src supplies two bytes per pixel
 * and width is the number of pixels to convert.
 */
void
util_format_b4g4r4a4_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;

   for (unsigned i = 0; i < width; ++i, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t blue = pixel & 0xf;
      const uint16_t green = (pixel >> 4) & 0xf;
      const uint16_t red = (pixel >> 8) & 0xf;
      const uint16_t alpha = pixel >> 12;

      rgba[0] = (float)(red * (1.0f/0xf));
      rgba[1] = (float)(green * (1.0f/0xf));
      rgba[2] = (float)(blue * (1.0f/0xf));
      rgba[3] = (float)(alpha * (1.0f/0xf));
   }
}

/**
 * Pack a rectangle of RGBA float pixels into B4G4R4A4_UNORM.
 *
 * Each source pixel is four floats (r, g, b, a).  Every channel is clamped
 * to [0, 1], multiplied by 15 and rounded with util_iround(), then stored
 * in one native 16-bit word: b in bits 0-3, g in bits 4-7, r in bits 8-11,
 * a in bits 12-15.  The same bit positions are used on big- and
 * little-endian hosts.  dst_stride and src_stride are byte distances
 * between rows.
 */
void
util_format_b4g4r4a4_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         const uint32_t blue = ((uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t green = ((uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t red = ((uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0xf)) & 0xf;
         /* The top field is not masked; the store truncates to 16 bits. */
         const uint32_t alpha = (uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0xf);
         *(uint16_t *)out = (uint16_t)(blue | (green << 4) | (red << 8) | (alpha << 12));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B4G4R4A4_UNORM pixel as RGBA floats.
 *
 * src points at the native 16-bit pixel word (b in bits 0-3, g in bits
 * 4-7, r in bits 8-11, a in bits 12-15).  Four floats (r, g, b, a), each
 * channel scaled by 1/15, are written to in_dst.  The i and j arguments
 * are not used.  The same bit positions are used on big- and
 * little-endian hosts.
 */
void
util_format_b4g4r4a4_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t blue = pixel & 0xf;
   const uint16_t green = (pixel >> 4) & 0xf;
   const uint16_t red = (pixel >> 8) & 0xf;
   const uint16_t alpha = pixel >> 12;

   rgba[0] = (float)(red * (1.0f/0xf));
   rgba[1] = (float)(green * (1.0f/0xf));
   rgba[2] = (float)(blue * (1.0f/0xf));
   rgba[3] = (float)(alpha * (1.0f/0xf));
}

/**
 * Unpack a run of B4G4R4A4_UNORM pixels to RGBA8 UNORM.
 *
 * Reads width native 16-bit words from src and writes four bytes
 * (r, g, b, a) per pixel to dst_row.  Word layout: b in bits 0-3, g in
 * bits 4-7, r in bits 8-11, a in bits 12-15.  Each 4-bit channel is
 * widened to 8 bits with _mesa_unorm_to_unorm().  The same bit positions
 * are used on big- and little-endian hosts.
 */
void
util_format_b4g4r4a4_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t blue = pixel & 0xf;
      const uint16_t green = (pixel >> 4) & 0xf;
      const uint16_t red = (pixel >> 8) & 0xf;
      const uint16_t alpha = pixel >> 12;
      rgba[0] = _mesa_unorm_to_unorm(red, 4, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 4, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 4, 8);
      rgba[3] = _mesa_unorm_to_unorm(alpha, 4, 8);
   }
}

/**
 * Pack a rectangle of RGBA8 UNORM pixels into B4G4R4A4_UNORM.
 *
 * Each source pixel is four bytes (r, g, b, a); each destination pixel is
 * one native 16-bit word with b in bits 0-3, g in bits 4-7, r in bits 8-11
 * and a in bits 12-15.  Channels are narrowed from 8 to 4 bits with
 * _mesa_unorm_to_unorm().  The same bit positions are used on big- and
 * little-endian hosts.  dst_stride and src_stride are byte distances
 * between rows.
 */
void
util_format_b4g4r4a4_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         const uint32_t blue = _mesa_unorm_to_unorm(rgba[2], 8, 4) & 0xf;
         const uint32_t green = _mesa_unorm_to_unorm(rgba[1], 8, 4) & 0xf;
         const uint32_t red = _mesa_unorm_to_unorm(rgba[0], 8, 4) & 0xf;
         /* The top field is not masked; the store truncates to 16 bits. */
         const uint32_t alpha = _mesa_unorm_to_unorm(rgba[3], 8, 4);
         *(uint16_t *)out = (uint16_t)(blue | (green << 4) | (red << 8) | (alpha << 12));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of B4G4R4X4_UNORM pixels to RGBA floats.
 *
 * Reads width native 16-bit words from src and writes four floats
 * (r, g, b, a) per pixel to dst_row.  Word layout: b in bits 0-3, g in
 * bits 4-7, r in bits 8-11; the top four bits are ignored and alpha is
 * always written as 1.  Colour channels are scaled by 1/15.  The same bit
 * positions are used on big- and little-endian hosts.
 */
void
util_format_b4g4r4x4_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t blue = pixel & 0xf;
      const uint16_t green = (pixel >> 4) & 0xf;
      const uint16_t red = (pixel >> 8) & 0xf;
      rgba[0] = (float)(red * (1.0f/0xf));
      rgba[1] = (float)(green * (1.0f/0xf));
      rgba[2] = (float)(blue * (1.0f/0xf));
      rgba[3] = 1; /* no alpha channel stored */
   }
}

/**
 * Pack a rectangle of RGBA float pixels into B4G4R4X4_UNORM.
 *
 * Each source pixel is four floats (r, g, b, a); alpha is not read.  The
 * colour channels are clamped to [0, 1], multiplied by 15 and rounded with
 * util_iround(), then stored in one native 16-bit word: b in bits 0-3,
 * g in bits 4-7, r in bits 8-11.  The top four bits are written as zero.
 * The same bit positions are used on big- and little-endian hosts.
 * dst_stride and src_stride are byte distances between rows.
 */
void
util_format_b4g4r4x4_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         const uint32_t blue = ((uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t green = ((uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t red = ((uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0xf)) & 0xf;
         *(uint16_t *)out = (uint16_t)(blue | (green << 4) | (red << 8));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B4G4R4X4_UNORM pixel as RGBA floats.
 *
 * src points at the native 16-bit pixel word (b in bits 0-3, g in bits
 * 4-7, r in bits 8-11; top four bits ignored).  Colour channels are scaled
 * by 1/15 and alpha is always written as 1.  The i and j arguments are not
 * used.  The same bit positions are used on big- and little-endian hosts.
 */
void
util_format_b4g4r4x4_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t blue = pixel & 0xf;
   const uint16_t green = (pixel >> 4) & 0xf;
   const uint16_t red = (pixel >> 8) & 0xf;

   rgba[0] = (float)(red * (1.0f/0xf));
   rgba[1] = (float)(green * (1.0f/0xf));
   rgba[2] = (float)(blue * (1.0f/0xf));
   rgba[3] = 1; /* no alpha channel stored */
}

/**
 * Unpack a run of B4G4R4X4_UNORM pixels to RGBA8 UNORM.
 *
 * Reads width native 16-bit words from src and writes four bytes
 * (r, g, b, a) per pixel to dst_row.  Word layout: b in bits 0-3, g in
 * bits 4-7, r in bits 8-11; the top four bits are ignored and alpha is
 * always written as 255.  Colour channels are widened from 4 to 8 bits
 * with _mesa_unorm_to_unorm().  The same bit positions are used on big-
 * and little-endian hosts.
 */
void
util_format_b4g4r4x4_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t blue = pixel & 0xf;
      const uint16_t green = (pixel >> 4) & 0xf;
      const uint16_t red = (pixel >> 8) & 0xf;
      rgba[0] = _mesa_unorm_to_unorm(red, 4, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 4, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 4, 8);
      rgba[3] = 255; /* no alpha channel stored */
   }
}

/**
 * Pack a rectangle of RGBA8 UNORM pixels into B4G4R4X4_UNORM.
 *
 * Each source pixel is four bytes (r, g, b, a); the alpha byte is not
 * read.  Each destination pixel is one native 16-bit word with b in bits
 * 0-3, g in bits 4-7 and r in bits 8-11; the top four bits are written as
 * zero.  Channels are narrowed from 8 to 4 bits with
 * _mesa_unorm_to_unorm().  The same bit positions are used on big- and
 * little-endian hosts.  dst_stride and src_stride are byte distances
 * between rows.
 */
void
util_format_b4g4r4x4_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         const uint32_t blue = _mesa_unorm_to_unorm(rgba[2], 8, 4) & 0xf;
         const uint32_t green = _mesa_unorm_to_unorm(rgba[1], 8, 4) & 0xf;
         const uint32_t red = _mesa_unorm_to_unorm(rgba[0], 8, 4) & 0xf;
         *(uint16_t *)out = (uint16_t)(blue | (green << 4) | (red << 8));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of A4R4G4B4_UNORM pixels to RGBA floats.
 *
 * Reads width native 16-bit words from src and writes four floats
 * (r, g, b, a) per pixel to dst_row.  Word layout: a in bits 0-3, r in
 * bits 4-7, g in bits 8-11, b in bits 12-15; every channel is scaled by
 * 1/15.  The same bit positions are used on big- and little-endian hosts.
 */
void
util_format_a4r4g4b4_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t alpha = pixel & 0xf;
      const uint16_t red = (pixel >> 4) & 0xf;
      const uint16_t green = (pixel >> 8) & 0xf;
      const uint16_t blue = pixel >> 12;
      rgba[0] = (float)(red * (1.0f/0xf));
      rgba[1] = (float)(green * (1.0f/0xf));
      rgba[2] = (float)(blue * (1.0f/0xf));
      rgba[3] = (float)(alpha * (1.0f/0xf));
   }
}

/**
 * Pack a rectangle of RGBA float pixels into A4R4G4B4_UNORM.
 *
 * Each source pixel is four floats (r, g, b, a).  Every channel is clamped
 * to [0, 1], multiplied by 15 and rounded with util_iround(), then stored
 * in one native 16-bit word: a in bits 0-3, r in bits 4-7, g in bits 8-11,
 * b in bits 12-15.  The same bit positions are used on big- and
 * little-endian hosts.  dst_stride and src_stride are byte distances
 * between rows.
 */
void
util_format_a4r4g4b4_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         const uint32_t alpha = ((uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t red = ((uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t green = ((uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0xf)) & 0xf;
         /* The top field is not masked; the store truncates to 16 bits. */
         const uint32_t blue = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0xf);
         *(uint16_t *)out = (uint16_t)(alpha | (red << 4) | (green << 8) | (blue << 12));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A4R4G4B4_UNORM pixel as RGBA floats.
 *
 * src points at the native 16-bit pixel word (a in bits 0-3, r in bits
 * 4-7, g in bits 8-11, b in bits 12-15).  Four floats (r, g, b, a), each
 * channel scaled by 1/15, are written to in_dst.  The i and j arguments
 * are not used.  The same bit positions are used on big- and
 * little-endian hosts.
 */
void
util_format_a4r4g4b4_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t alpha = pixel & 0xf;
   const uint16_t red = (pixel >> 4) & 0xf;
   const uint16_t green = (pixel >> 8) & 0xf;
   const uint16_t blue = pixel >> 12;

   rgba[0] = (float)(red * (1.0f/0xf));
   rgba[1] = (float)(green * (1.0f/0xf));
   rgba[2] = (float)(blue * (1.0f/0xf));
   rgba[3] = (float)(alpha * (1.0f/0xf));
}

/**
 * Unpack a run of A4R4G4B4_UNORM pixels to RGBA8 UNORM.
 *
 * Reads width native 16-bit words from src and writes four bytes
 * (r, g, b, a) per pixel to dst_row.  Word layout: a in bits 0-3, r in
 * bits 4-7, g in bits 8-11, b in bits 12-15.  Each 4-bit channel is
 * widened to 8 bits with _mesa_unorm_to_unorm().  The same bit positions
 * are used on big- and little-endian hosts.
 */
void
util_format_a4r4g4b4_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t alpha = pixel & 0xf;
      const uint16_t red = (pixel >> 4) & 0xf;
      const uint16_t green = (pixel >> 8) & 0xf;
      const uint16_t blue = pixel >> 12;
      rgba[0] = _mesa_unorm_to_unorm(red, 4, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 4, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 4, 8);
      rgba[3] = _mesa_unorm_to_unorm(alpha, 4, 8);
   }
}

/**
 * Pack a rectangle of RGBA8 UNORM pixels into A4R4G4B4_UNORM.
 *
 * Each source pixel is four bytes (r, g, b, a); each destination pixel is
 * one native 16-bit word with a in bits 0-3, r in bits 4-7, g in bits 8-11
 * and b in bits 12-15.  Channels are narrowed from 8 to 4 bits with
 * _mesa_unorm_to_unorm().  The same bit positions are used on big- and
 * little-endian hosts.  dst_stride and src_stride are byte distances
 * between rows.
 */
void
util_format_a4r4g4b4_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         const uint32_t alpha = _mesa_unorm_to_unorm(rgba[3], 8, 4) & 0xf;
         const uint32_t red = _mesa_unorm_to_unorm(rgba[0], 8, 4) & 0xf;
         const uint32_t green = _mesa_unorm_to_unorm(rgba[1], 8, 4) & 0xf;
         /* The top field is not masked; the store truncates to 16 bits. */
         const uint32_t blue = _mesa_unorm_to_unorm(rgba[2], 8, 4);
         *(uint16_t *)out = (uint16_t)(alpha | (red << 4) | (green << 8) | (blue << 12));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of A4B4G4R4_UNORM pixels to RGBA floats.
 *
 * Reads width native 16-bit words from src and writes four floats
 * (r, g, b, a) per pixel to dst_row.  Word layout: a in bits 0-3, b in
 * bits 4-7, g in bits 8-11, r in bits 12-15; every channel is scaled by
 * 1/15.  The same bit positions are used on big- and little-endian hosts.
 */
void
util_format_a4b4g4r4_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t alpha = pixel & 0xf;
      const uint16_t blue = (pixel >> 4) & 0xf;
      const uint16_t green = (pixel >> 8) & 0xf;
      const uint16_t red = pixel >> 12;
      rgba[0] = (float)(red * (1.0f/0xf));
      rgba[1] = (float)(green * (1.0f/0xf));
      rgba[2] = (float)(blue * (1.0f/0xf));
      rgba[3] = (float)(alpha * (1.0f/0xf));
   }
}

/**
 * Pack a rectangle of RGBA float pixels into A4B4G4R4_UNORM.
 *
 * Each source pixel is four floats (r, g, b, a).  Every channel is clamped
 * to [0, 1], multiplied by 15 and rounded with util_iround(), then stored
 * in one native 16-bit word: a in bits 0-3, b in bits 4-7, g in bits 8-11,
 * r in bits 12-15.  The same bit positions are used on big- and
 * little-endian hosts.  dst_stride and src_stride are byte distances
 * between rows.
 */
void
util_format_a4b4g4r4_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         const uint32_t alpha = ((uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t blue = ((uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint32_t green = ((uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0xf)) & 0xf;
         /* The top field is not masked; the store truncates to 16 bits. */
         const uint32_t red = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0xf);
         *(uint16_t *)out = (uint16_t)(alpha | (blue << 4) | (green << 8) | (red << 12));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A4B4G4R4_UNORM pixel as RGBA floats.
 *
 * src points at the native 16-bit pixel word (a in bits 0-3, b in bits
 * 4-7, g in bits 8-11, r in bits 12-15).  Four floats (r, g, b, a), each
 * channel scaled by 1/15, are written to in_dst.  The i and j arguments
 * are not used.  The same bit positions are used on big- and
 * little-endian hosts.
 */
void
util_format_a4b4g4r4_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t alpha = pixel & 0xf;
   const uint16_t blue = (pixel >> 4) & 0xf;
   const uint16_t green = (pixel >> 8) & 0xf;
   const uint16_t red = pixel >> 12;

   rgba[0] = (float)(red * (1.0f/0xf));
   rgba[1] = (float)(green * (1.0f/0xf));
   rgba[2] = (float)(blue * (1.0f/0xf));
   rgba[3] = (float)(alpha * (1.0f/0xf));
}

/**
 * Unpack a run of A4B4G4R4_UNORM pixels to RGBA8 UNORM.
 *
 * Reads width native 16-bit words from src and writes four bytes
 * (r, g, b, a) per pixel to dst_row.  Word layout: a in bits 0-3, b in
 * bits 4-7, g in bits 8-11, r in bits 12-15.  Each 4-bit channel is
 * widened to 8 bits with _mesa_unorm_to_unorm().  The same bit positions
 * are used on big- and little-endian hosts.
 */
void
util_format_a4b4g4r4_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t alpha = pixel & 0xf;
      const uint16_t blue = (pixel >> 4) & 0xf;
      const uint16_t green = (pixel >> 8) & 0xf;
      const uint16_t red = pixel >> 12;
      rgba[0] = _mesa_unorm_to_unorm(red, 4, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 4, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 4, 8);
      rgba[3] = _mesa_unorm_to_unorm(alpha, 4, 8);
   }
}

/**
 * Pack a rectangle of RGBA8 UNORM pixels into A4B4G4R4_UNORM.
 *
 * Each source pixel is four bytes (r, g, b, a); each destination pixel is
 * one native 16-bit word with a in bits 0-3, b in bits 4-7, g in bits 8-11
 * and r in bits 12-15.  Channels are narrowed from 8 to 4 bits with
 * _mesa_unorm_to_unorm().  The same bit positions are used on big- and
 * little-endian hosts.  dst_stride and src_stride are byte distances
 * between rows.
 */
void
util_format_a4b4g4r4_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         const uint32_t alpha = _mesa_unorm_to_unorm(rgba[3], 8, 4) & 0xf;
         const uint32_t blue = _mesa_unorm_to_unorm(rgba[2], 8, 4) & 0xf;
         const uint32_t green = _mesa_unorm_to_unorm(rgba[1], 8, 4) & 0xf;
         /* The top field is not masked; the store truncates to 16 bits. */
         const uint32_t red = _mesa_unorm_to_unorm(rgba[0], 8, 4);
         *(uint16_t *)out = (uint16_t)(alpha | (blue << 4) | (green << 8) | (red << 12));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R5G6B5_UNORM pixels to RGBA floats.
 *
 * Reads width native 16-bit words from src and writes four floats
 * (r, g, b, a) per pixel to dst_row.  Word layout: r in bits 0-4, g in
 * bits 5-10, b in bits 11-15.  The 5-bit channels are scaled by 1/31, the
 * 6-bit channel by 1/63, and alpha is always written as 1.  The same bit
 * positions are used on big- and little-endian hosts.
 */
void
util_format_r5g6b5_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t red = pixel & 0x1f;
      const uint16_t green = (pixel >> 5) & 0x3f;
      const uint16_t blue = pixel >> 11;
      rgba[0] = (float)(red * (1.0f/0x1f));
      rgba[1] = (float)(green * (1.0f/0x3f));
      rgba[2] = (float)(blue * (1.0f/0x1f));
      rgba[3] = 1; /* no alpha channel stored */
   }
}

/**
 * Pack a rectangle of RGBA float pixels into R5G6B5_UNORM.
 *
 * Each source pixel is four floats (r, g, b, a); alpha is not read.  The
 * colour channels are clamped to [0, 1], scaled to their bit width (31 for
 * r and b, 63 for g) and rounded with util_iround(), then stored in one
 * native 16-bit word: r in bits 0-4, g in bits 5-10, b in bits 11-15.
 * The same bit positions are used on big- and little-endian hosts.
 * dst_stride and src_stride are byte distances between rows.
 */
void
util_format_r5g6b5_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         const uint32_t red = ((uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t green = ((uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x3f)) & 0x3f;
         /* The top field is not masked; the store truncates to 16 bits. */
         const uint32_t blue = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x1f);
         *(uint16_t *)out = (uint16_t)(red | (green << 5) | (blue << 11));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R5G6B5_UNORM pixel as RGBA floats.
 *
 * src points at the native 16-bit pixel word (r in bits 0-4, g in bits
 * 5-10, b in bits 11-15).  The 5-bit channels are scaled by 1/31, the
 * 6-bit channel by 1/63, and alpha is always written as 1.  The i and j
 * arguments are not used.  The same bit positions are used on big- and
 * little-endian hosts.
 */
void
util_format_r5g6b5_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t red = pixel & 0x1f;
   const uint16_t green = (pixel >> 5) & 0x3f;
   const uint16_t blue = pixel >> 11;

   rgba[0] = (float)(red * (1.0f/0x1f));
   rgba[1] = (float)(green * (1.0f/0x3f));
   rgba[2] = (float)(blue * (1.0f/0x1f));
   rgba[3] = 1; /* no alpha channel stored */
}

/**
 * Unpack a run of R5G6B5_UNORM pixels to RGBA8 UNORM.
 *
 * Reads width native 16-bit words from src and writes four bytes
 * (r, g, b, a) per pixel to dst_row.  Word layout: r in bits 0-4, g in
 * bits 5-10, b in bits 11-15.  Channels are widened to 8 bits with
 * _mesa_unorm_to_unorm() and alpha is always written as 255.  The same
 * bit positions are used on big- and little-endian hosts.
 */
void
util_format_r5g6b5_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t red = pixel & 0x1f;
      const uint16_t green = (pixel >> 5) & 0x3f;
      const uint16_t blue = pixel >> 11;
      rgba[0] = _mesa_unorm_to_unorm(red, 5, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 6, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 5, 8);
      rgba[3] = 255; /* no alpha channel stored */
   }
}

/**
 * Pack a rectangle of RGBA8 UNORM pixels into R5G6B5_UNORM.
 *
 * Each source pixel is four bytes (r, g, b, a); the alpha byte is not
 * read.  Each destination pixel is one native 16-bit word with r in bits
 * 0-4, g in bits 5-10 and b in bits 11-15.  Channels are narrowed from
 * 8 bits with _mesa_unorm_to_unorm().  The same bit positions are used on
 * big- and little-endian hosts.  dst_stride and src_stride are byte
 * distances between rows.
 */
void
util_format_r5g6b5_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         const uint32_t red = _mesa_unorm_to_unorm(rgba[0], 8, 5) & 0x1f;
         const uint32_t green = _mesa_unorm_to_unorm(rgba[1], 8, 6) & 0x3f;
         /* The top field is not masked; the store truncates to 16 bits. */
         const uint32_t blue = _mesa_unorm_to_unorm(rgba[2], 8, 5);
         *(uint16_t *)out = (uint16_t)(red | (green << 5) | (blue << 11));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of B5G6R5_UNORM pixels to RGBA floats.
 *
 * Reads width native 16-bit words from src and writes four floats
 * (r, g, b, a) per pixel to dst_row.  Word layout: b in bits 0-4, g in
 * bits 5-10, r in bits 11-15.  The 5-bit channels are scaled by 1/31, the
 * 6-bit channel by 1/63, and alpha is always written as 1.  The same bit
 * positions are used on big- and little-endian hosts.
 */
void
util_format_b5g6r5_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t blue = pixel & 0x1f;
      const uint16_t green = (pixel >> 5) & 0x3f;
      const uint16_t red = pixel >> 11;
      rgba[0] = (float)(red * (1.0f/0x1f));
      rgba[1] = (float)(green * (1.0f/0x3f));
      rgba[2] = (float)(blue * (1.0f/0x1f));
      rgba[3] = 1; /* no alpha channel stored */
   }
}

/**
 * Pack a rectangle of RGBA float pixels into B5G6R5_UNORM.
 *
 * Each source pixel is four floats (r, g, b, a); alpha is not read.  The
 * colour channels are clamped to [0, 1], scaled to their bit width (31 for
 * r and b, 63 for g) and rounded with util_iround(), then stored in one
 * native 16-bit word: b in bits 0-4, g in bits 5-10, r in bits 11-15.
 * The same bit positions are used on big- and little-endian hosts.
 * dst_stride and src_stride are byte distances between rows.
 */
void
util_format_b5g6r5_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         const uint32_t blue = ((uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t green = ((uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x3f)) & 0x3f;
         /* The top field is not masked; the store truncates to 16 bits. */
         const uint32_t red = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x1f);
         *(uint16_t *)out = (uint16_t)(blue | (green << 5) | (red << 11));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B5G6R5_UNORM pixel as RGBA floats.
 *
 * src points at the native 16-bit pixel word (b in bits 0-4, g in bits
 * 5-10, r in bits 11-15).  The 5-bit channels are scaled by 1/31, the
 * 6-bit channel by 1/63, and alpha is always written as 1.  The i and j
 * arguments are not used.  The same bit positions are used on big- and
 * little-endian hosts.
 */
void
util_format_b5g6r5_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t blue = pixel & 0x1f;
   const uint16_t green = (pixel >> 5) & 0x3f;
   const uint16_t red = pixel >> 11;

   rgba[0] = (float)(red * (1.0f/0x1f));
   rgba[1] = (float)(green * (1.0f/0x3f));
   rgba[2] = (float)(blue * (1.0f/0x1f));
   rgba[3] = 1; /* no alpha channel stored */
}

/**
 * Unpack a run of B5G6R5_UNORM pixels to RGBA8 UNORM.
 *
 * Reads width native 16-bit words from src and writes four bytes
 * (r, g, b, a) per pixel to dst_row.  Word layout: b in bits 0-4, g in
 * bits 5-10, r in bits 11-15.  Channels are widened to 8 bits with
 * _mesa_unorm_to_unorm() and alpha is always written as 255.  The same
 * bit positions are used on big- and little-endian hosts.
 */
void
util_format_b5g6r5_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t blue = pixel & 0x1f;
      const uint16_t green = (pixel >> 5) & 0x3f;
      const uint16_t red = pixel >> 11;
      rgba[0] = _mesa_unorm_to_unorm(red, 5, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 6, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 5, 8);
      rgba[3] = 255; /* no alpha channel stored */
   }
}

/**
 * Pack a rectangle of RGBA8 UNORM pixels into B5G6R5_UNORM.
 *
 * Each source pixel is four bytes (r, g, b, a); the alpha byte is not
 * read.  Each destination pixel is one native 16-bit word with b in bits
 * 0-4, g in bits 5-10 and r in bits 11-15.  Channels are narrowed from
 * 8 bits with _mesa_unorm_to_unorm().  The same bit positions are used on
 * big- and little-endian hosts.  dst_stride and src_stride are byte
 * distances between rows.
 */
void
util_format_b5g6r5_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         const uint32_t blue = _mesa_unorm_to_unorm(rgba[2], 8, 5) & 0x1f;
         const uint32_t green = _mesa_unorm_to_unorm(rgba[1], 8, 6) & 0x3f;
         /* The top field is not masked; the store truncates to 16 bits. */
         const uint32_t red = _mesa_unorm_to_unorm(rgba[0], 8, 5);
         *(uint16_t *)out = (uint16_t)(blue | (green << 5) | (red << 11));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Convert a row of packed 32-bit R10G10B10A2 pixels to four floats each.
 *
 * Each 32-bit word read from \p src carries red in bits 0-9, green in
 * bits 10-19, blue in bits 20-29 and alpha in bits 30-31.  The 10-bit
 * fields are multiplied by 1/0x3ff and the 2-bit alpha by 1/0x3.
 *
 * \param dst_row  receives r, g, b, a floats for each pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r10g10b10a2_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   for (unsigned remaining = width; remaining > 0; --remaining, src += 4, out += 4) {
      /* Field positions are defined on the natively loaded word. */
      const uint32_t pixel = *(const uint32_t *)src;
      const uint32_t red = pixel & 0x3ff;
      const uint32_t green = (pixel >> 10) & 0x3ff;
      const uint32_t blue = (pixel >> 20) & 0x3ff;
      const uint32_t alpha = pixel >> 30;

      out[0] = (float)(red * (1.0f/0x3ff));
      out[1] = (float)(green * (1.0f/0x3ff));
      out[2] = (float)(blue * (1.0f/0x3ff));
      out[3] = (float)(alpha * (1.0f/0x3));
   }
}

/**
 * Pack rows of RGBA float pixels into 32-bit R10G10B10A2 words.
 *
 * Each input channel goes through CLAMP(x, 0.0f, 1.0f), is multiplied by
 * 0x3ff (0x3 for alpha) and converted with util_iround().  Red lands at
 * bit 0, green at bit 10, blue at bit 20 and alpha at bit 30.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  bytes added to the destination pointer per row
 * \param src_row     first source row, 4 floats per pixel
 * \param src_stride  source row pitch in bytes (divided by sizeof(float))
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r10g10b10a2_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t red = (uint32_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x3ff) & 0x3ff;
         const uint32_t green = (uint32_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x3ff) & 0x3ff;
         const uint32_t blue = (uint32_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3ff) & 0x3ff;
         const uint32_t alpha = (uint32_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0x3);

         *(uint32_t *)out = red | (green << 10) | (blue << 20) | (alpha << 30);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Decode a single 32-bit R10G10B10A2 texel into four floats.
 *
 * Red is taken from bits 0-9, green from bits 10-19, blue from bits 20-29
 * and alpha from bits 30-31 of the word read at \p src.  The 10-bit fields
 * are multiplied by 1/0x3ff and alpha by 1/0x3.
 *
 * \param in_dst  receives r, g, b, a as floats
 * \param src     points at the packed texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_r10g10b10a2_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   /* Field positions are defined on the natively loaded word. */
   const uint32_t texel = *(const uint32_t *)src;
   const uint32_t red = texel & 0x3ff;
   const uint32_t green = (texel >> 10) & 0x3ff;
   const uint32_t blue = (texel >> 20) & 0x3ff;
   const uint32_t alpha = texel >> 30;

   rgba[0] = (float)(red * (1.0f/0x3ff));
   rgba[1] = (float)(green * (1.0f/0x3ff));
   rgba[2] = (float)(blue * (1.0f/0x3ff));
   rgba[3] = (float)(alpha * (1.0f/0x3));
}

/**
 * Expand a row of packed 32-bit R10G10B10A2 pixels into four bytes per pixel.
 *
 * Red is taken from bits 0-9, green from bits 10-19, blue from bits 20-29
 * and alpha from bits 30-31.  The colour fields are converted with
 * _mesa_unorm_to_unorm(x, 10, 8) and alpha with _mesa_unorm_to_unorm(x, 2, 8).
 *
 * \param dst_row  receives r, g, b, a bytes for each pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r10g10b10a2_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned remaining = width; remaining > 0; --remaining, src += 4, out += 4) {
      /* Field positions are defined on the natively loaded word. */
      const uint32_t pixel = *(const uint32_t *)src;
      const uint32_t red = pixel & 0x3ff;
      const uint32_t green = (pixel >> 10) & 0x3ff;
      const uint32_t blue = (pixel >> 20) & 0x3ff;
      const uint32_t alpha = pixel >> 30;

      out[0] = _mesa_unorm_to_unorm(red, 10, 8);
      out[1] = _mesa_unorm_to_unorm(green, 10, 8);
      out[2] = _mesa_unorm_to_unorm(blue, 10, 8);
      out[3] = _mesa_unorm_to_unorm(alpha, 2, 8);
   }
}

/**
 * Pack rows of 4-byte RGBA pixels into 32-bit R10G10B10A2 words.
 *
 * The colour bytes are converted with _mesa_unorm_to_unorm(x, 8, 10) and
 * alpha with _mesa_unorm_to_unorm(x, 8, 2).  Red lands at bit 0, green at
 * bit 10, blue at bit 20 and alpha at bit 30.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  bytes added to the destination pointer per row
 * \param src_row     first source row, 4 bytes per pixel
 * \param src_stride  bytes added to the source pointer per row
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r10g10b10a2_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t red = _mesa_unorm_to_unorm(in[0], 8, 10) & 0x3ff;
         const uint32_t green = _mesa_unorm_to_unorm(in[1], 8, 10) & 0x3ff;
         const uint32_t blue = _mesa_unorm_to_unorm(in[2], 8, 10) & 0x3ff;
         const uint32_t alpha = _mesa_unorm_to_unorm(in[3], 8, 2);

         *(uint32_t *)out = red | (green << 10) | (blue << 20) | (alpha << 30);
      }

      dst_row += dst_stride;
      /* Source elements are single bytes, so the byte stride is the element stride. */
      src_row += src_stride;
   }
}

/**
 * Convert a row of packed 32-bit R10G10B10X2 pixels to four floats each.
 *
 * Red is taken from bits 0-9, green from bits 10-19 and blue from bits
 * 20-29 of each word; every field is multiplied by 1/0x3ff.  Bits 30-31
 * are ignored and alpha is written as 1.
 *
 * \param dst_row  receives r, g, b, a floats for each pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r10g10b10x2_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   for (unsigned remaining = width; remaining > 0; --remaining, src += 4, out += 4) {
      /* Field positions are defined on the natively loaded word. */
      const uint32_t pixel = *(const uint32_t *)src;
      const uint32_t red = pixel & 0x3ff;
      const uint32_t green = (pixel >> 10) & 0x3ff;
      const uint32_t blue = (pixel >> 20) & 0x3ff;

      out[0] = (float)(red * (1.0f/0x3ff));
      out[1] = (float)(green * (1.0f/0x3ff));
      out[2] = (float)(blue * (1.0f/0x3ff));
      out[3] = 1;
   }
}

/**
 * Pack rows of RGBA float pixels into 32-bit R10G10B10X2 words.
 *
 * The r, g and b inputs go through CLAMP(x, 0.0f, 1.0f), are multiplied by
 * 0x3ff and converted with util_iround().  Red lands at bit 0, green at
 * bit 10 and blue at bit 20; bits 30-31 are left zero and the source
 * alpha is not read.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  bytes added to the destination pointer per row
 * \param src_row     first source row, 4 floats per pixel
 * \param src_stride  source row pitch in bytes (divided by sizeof(float))
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r10g10b10x2_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t red = (uint32_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x3ff) & 0x3ff;
         const uint32_t green = (uint32_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x3ff) & 0x3ff;
         const uint32_t blue = (uint32_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3ff) & 0x3ff;

         *(uint32_t *)out = red | (green << 10) | (blue << 20);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Decode a single 32-bit R10G10B10X2 texel into four floats.
 *
 * Red is taken from bits 0-9, green from bits 10-19 and blue from bits
 * 20-29; each is multiplied by 1/0x3ff.  Alpha is written as 1.
 *
 * \param in_dst  receives r, g, b, a as floats
 * \param src     points at the packed texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_r10g10b10x2_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   /* Field positions are defined on the natively loaded word. */
   const uint32_t texel = *(const uint32_t *)src;
   const uint32_t red = texel & 0x3ff;
   const uint32_t green = (texel >> 10) & 0x3ff;
   const uint32_t blue = (texel >> 20) & 0x3ff;

   rgba[0] = (float)(red * (1.0f/0x3ff));
   rgba[1] = (float)(green * (1.0f/0x3ff));
   rgba[2] = (float)(blue * (1.0f/0x3ff));
   rgba[3] = 1;
}

/**
 * Expand a row of packed 32-bit R10G10B10X2 pixels into four bytes per pixel.
 *
 * Red is taken from bits 0-9, green from bits 10-19 and blue from bits
 * 20-29; each is converted with _mesa_unorm_to_unorm(x, 10, 8).  Alpha is
 * written as 255.
 *
 * \param dst_row  receives r, g, b, a bytes for each pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r10g10b10x2_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned remaining = width; remaining > 0; --remaining, src += 4, out += 4) {
      /* Field positions are defined on the natively loaded word. */
      const uint32_t pixel = *(const uint32_t *)src;
      const uint32_t red = pixel & 0x3ff;
      const uint32_t green = (pixel >> 10) & 0x3ff;
      const uint32_t blue = (pixel >> 20) & 0x3ff;

      out[0] = _mesa_unorm_to_unorm(red, 10, 8);
      out[1] = _mesa_unorm_to_unorm(green, 10, 8);
      out[2] = _mesa_unorm_to_unorm(blue, 10, 8);
      out[3] = 255;
   }
}

/**
 * Pack rows of 4-byte RGBA pixels into 32-bit R10G10B10X2 words.
 *
 * The r, g and b bytes are converted with _mesa_unorm_to_unorm(x, 8, 10).
 * Red lands at bit 0, green at bit 10 and blue at bit 20; bits 30-31 are
 * left zero and the source alpha byte is not read.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  bytes added to the destination pointer per row
 * \param src_row     first source row, 4 bytes per pixel
 * \param src_stride  bytes added to the source pointer per row
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r10g10b10x2_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t red = _mesa_unorm_to_unorm(in[0], 8, 10) & 0x3ff;
         const uint32_t green = _mesa_unorm_to_unorm(in[1], 8, 10) & 0x3ff;
         const uint32_t blue = _mesa_unorm_to_unorm(in[2], 8, 10) & 0x3ff;

         *(uint32_t *)out = red | (green << 10) | (blue << 20);
      }

      dst_row += dst_stride;
      /* Source elements are single bytes, so the byte stride is the element stride. */
      src_row += src_stride;
   }
}

/**
 * Convert a row of packed 32-bit B10G10R10A2 pixels to four floats each.
 *
 * Blue is taken from bits 0-9, green from bits 10-19, red from bits 20-29
 * and alpha from bits 30-31.  The 10-bit fields are multiplied by 1/0x3ff
 * and alpha by 1/0x3.  Output order is r, g, b, a.
 *
 * \param dst_row  receives r, g, b, a floats for each pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_b10g10r10a2_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   for (unsigned remaining = width; remaining > 0; --remaining, src += 4, out += 4) {
      /* Field positions are defined on the natively loaded word. */
      const uint32_t pixel = *(const uint32_t *)src;
      const uint32_t blue = pixel & 0x3ff;
      const uint32_t green = (pixel >> 10) & 0x3ff;
      const uint32_t red = (pixel >> 20) & 0x3ff;
      const uint32_t alpha = pixel >> 30;

      out[0] = (float)(red * (1.0f/0x3ff));
      out[1] = (float)(green * (1.0f/0x3ff));
      out[2] = (float)(blue * (1.0f/0x3ff));
      out[3] = (float)(alpha * (1.0f/0x3));
   }
}

/**
 * Pack rows of RGBA float pixels into 32-bit B10G10R10A2 words.
 *
 * Each input channel goes through CLAMP(x, 0.0f, 1.0f), is multiplied by
 * 0x3ff (0x3 for alpha) and converted with util_iround().  Blue lands at
 * bit 0, green at bit 10, red at bit 20 and alpha at bit 30.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  bytes added to the destination pointer per row
 * \param src_row     first source row, 4 floats per pixel
 * \param src_stride  source row pitch in bytes (divided by sizeof(float))
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_b10g10r10a2_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t red = (uint32_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x3ff) & 0x3ff;
         const uint32_t green = (uint32_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x3ff) & 0x3ff;
         const uint32_t blue = (uint32_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3ff) & 0x3ff;
         const uint32_t alpha = (uint32_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0x3);

         *(uint32_t *)out = blue | (green << 10) | (red << 20) | (alpha << 30);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Decode a single 32-bit B10G10R10A2 texel into four floats.
 *
 * Blue is taken from bits 0-9, green from bits 10-19, red from bits 20-29
 * and alpha from bits 30-31.  The 10-bit fields are multiplied by 1/0x3ff
 * and alpha by 1/0x3.  Output order is r, g, b, a.
 *
 * \param in_dst  receives r, g, b, a as floats
 * \param src     points at the packed texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_b10g10r10a2_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   /* Field positions are defined on the natively loaded word. */
   const uint32_t texel = *(const uint32_t *)src;
   const uint32_t blue = texel & 0x3ff;
   const uint32_t green = (texel >> 10) & 0x3ff;
   const uint32_t red = (texel >> 20) & 0x3ff;
   const uint32_t alpha = texel >> 30;

   rgba[0] = (float)(red * (1.0f/0x3ff));
   rgba[1] = (float)(green * (1.0f/0x3ff));
   rgba[2] = (float)(blue * (1.0f/0x3ff));
   rgba[3] = (float)(alpha * (1.0f/0x3));
}

/**
 * Expand a row of packed 32-bit B10G10R10A2 pixels into four bytes per pixel.
 *
 * Blue is taken from bits 0-9, green from bits 10-19, red from bits 20-29
 * and alpha from bits 30-31.  The colour fields are converted with
 * _mesa_unorm_to_unorm(x, 10, 8) and alpha with _mesa_unorm_to_unorm(x, 2, 8).
 * Output order is r, g, b, a.
 *
 * \param dst_row  receives r, g, b, a bytes for each pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_b10g10r10a2_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned remaining = width; remaining > 0; --remaining, src += 4, out += 4) {
      /* Field positions are defined on the natively loaded word. */
      const uint32_t pixel = *(const uint32_t *)src;
      const uint32_t blue = pixel & 0x3ff;
      const uint32_t green = (pixel >> 10) & 0x3ff;
      const uint32_t red = (pixel >> 20) & 0x3ff;
      const uint32_t alpha = pixel >> 30;

      out[0] = _mesa_unorm_to_unorm(red, 10, 8);
      out[1] = _mesa_unorm_to_unorm(green, 10, 8);
      out[2] = _mesa_unorm_to_unorm(blue, 10, 8);
      out[3] = _mesa_unorm_to_unorm(alpha, 2, 8);
   }
}

/**
 * Pack rows of 4-byte RGBA pixels into 32-bit B10G10R10A2 words.
 *
 * The colour bytes are converted with _mesa_unorm_to_unorm(x, 8, 10) and
 * alpha with _mesa_unorm_to_unorm(x, 8, 2).  Blue lands at bit 0, green at
 * bit 10, red at bit 20 and alpha at bit 30.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  bytes added to the destination pointer per row
 * \param src_row     first source row, 4 bytes per pixel
 * \param src_stride  bytes added to the source pointer per row
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_b10g10r10a2_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t red = _mesa_unorm_to_unorm(in[0], 8, 10) & 0x3ff;
         const uint32_t green = _mesa_unorm_to_unorm(in[1], 8, 10) & 0x3ff;
         const uint32_t blue = _mesa_unorm_to_unorm(in[2], 8, 10) & 0x3ff;
         const uint32_t alpha = _mesa_unorm_to_unorm(in[3], 8, 2);

         *(uint32_t *)out = blue | (green << 10) | (red << 20) | (alpha << 30);
      }

      dst_row += dst_stride;
      /* Source elements are single bytes, so the byte stride is the element stride. */
      src_row += src_stride;
   }
}

/**
 * Convert a row of packed 32-bit A2R10G10B10 pixels to four floats each.
 *
 * Alpha is taken from bits 0-1, red from bits 2-11, green from bits 12-21
 * and blue from bits 22-31.  The 10-bit fields are multiplied by 1/0x3ff
 * and alpha by 1/0x3.  Output order is r, g, b, a.
 *
 * \param dst_row  receives r, g, b, a floats for each pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_a2r10g10b10_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   for (unsigned remaining = width; remaining > 0; --remaining, src += 4, out += 4) {
      /* Field positions are defined on the natively loaded word. */
      const uint32_t pixel = *(const uint32_t *)src;
      const uint32_t alpha = pixel & 0x3;
      const uint32_t red = (pixel >> 2) & 0x3ff;
      const uint32_t green = (pixel >> 12) & 0x3ff;
      const uint32_t blue = pixel >> 22;

      out[0] = (float)(red * (1.0f/0x3ff));
      out[1] = (float)(green * (1.0f/0x3ff));
      out[2] = (float)(blue * (1.0f/0x3ff));
      out[3] = (float)(alpha * (1.0f/0x3));
   }
}

/**
 * Pack rows of RGBA float pixels into 32-bit A2R10G10B10 words.
 *
 * Each input channel goes through CLAMP(x, 0.0f, 1.0f), is multiplied by
 * 0x3ff (0x3 for alpha) and converted with util_iround().  Alpha lands at
 * bit 0, red at bit 2, green at bit 12 and blue at bit 22.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  bytes added to the destination pointer per row
 * \param src_row     first source row, 4 floats per pixel
 * \param src_stride  source row pitch in bytes (divided by sizeof(float))
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_a2r10g10b10_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t red = (uint32_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x3ff) & 0x3ff;
         const uint32_t green = (uint32_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x3ff) & 0x3ff;
         /* Blue occupies the top field, so it is shifted without a mask. */
         const uint32_t blue = (uint32_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3ff);
         const uint32_t alpha = (uint32_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0x3) & 0x3;

         *(uint32_t *)out = alpha | (red << 2) | (green << 12) | (blue << 22);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Decode a single 32-bit A2R10G10B10 texel into four floats.
 *
 * Alpha is taken from bits 0-1, red from bits 2-11, green from bits 12-21
 * and blue from bits 22-31.  The 10-bit fields are multiplied by 1/0x3ff
 * and alpha by 1/0x3.  Output order is r, g, b, a.
 *
 * \param in_dst  receives r, g, b, a as floats
 * \param src     points at the packed texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_a2r10g10b10_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   /* Field positions are defined on the natively loaded word. */
   const uint32_t texel = *(const uint32_t *)src;
   const uint32_t alpha = texel & 0x3;
   const uint32_t red = (texel >> 2) & 0x3ff;
   const uint32_t green = (texel >> 12) & 0x3ff;
   const uint32_t blue = texel >> 22;

   rgba[0] = (float)(red * (1.0f/0x3ff));
   rgba[1] = (float)(green * (1.0f/0x3ff));
   rgba[2] = (float)(blue * (1.0f/0x3ff));
   rgba[3] = (float)(alpha * (1.0f/0x3));
}

/**
 * Expand a row of packed 32-bit A2R10G10B10 pixels into four bytes per pixel.
 *
 * Alpha is taken from bits 0-1, red from bits 2-11, green from bits 12-21
 * and blue from bits 22-31.  The colour fields are converted with
 * _mesa_unorm_to_unorm(x, 10, 8) and alpha with _mesa_unorm_to_unorm(x, 2, 8).
 * Output order is r, g, b, a.
 *
 * \param dst_row  receives r, g, b, a bytes for each pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_a2r10g10b10_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned remaining = width; remaining > 0; --remaining, src += 4, out += 4) {
      /* Field positions are defined on the natively loaded word. */
      const uint32_t pixel = *(const uint32_t *)src;
      const uint32_t alpha = pixel & 0x3;
      const uint32_t red = (pixel >> 2) & 0x3ff;
      const uint32_t green = (pixel >> 12) & 0x3ff;
      const uint32_t blue = pixel >> 22;

      out[0] = _mesa_unorm_to_unorm(red, 10, 8);
      out[1] = _mesa_unorm_to_unorm(green, 10, 8);
      out[2] = _mesa_unorm_to_unorm(blue, 10, 8);
      out[3] = _mesa_unorm_to_unorm(alpha, 2, 8);
   }
}

/**
 * Pack rows of 4-byte RGBA pixels into 32-bit A2R10G10B10 words.
 *
 * The colour bytes are converted with _mesa_unorm_to_unorm(x, 8, 10) and
 * alpha with _mesa_unorm_to_unorm(x, 8, 2).  Alpha lands at bit 0, red at
 * bit 2, green at bit 12 and blue at bit 22.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  bytes added to the destination pointer per row
 * \param src_row     first source row, 4 bytes per pixel
 * \param src_stride  bytes added to the source pointer per row
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_a2r10g10b10_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t red = _mesa_unorm_to_unorm(in[0], 8, 10) & 0x3ff;
         const uint32_t green = _mesa_unorm_to_unorm(in[1], 8, 10) & 0x3ff;
         /* Blue occupies the top field, so it is shifted without a mask. */
         const uint32_t blue = _mesa_unorm_to_unorm(in[2], 8, 10);
         const uint32_t alpha = _mesa_unorm_to_unorm(in[3], 8, 2) & 0x3;

         *(uint32_t *)out = alpha | (red << 2) | (green << 12) | (blue << 22);
      }

      dst_row += dst_stride;
      /* Source elements are single bytes, so the byte stride is the element stride. */
      src_row += src_stride;
   }
}

/**
 * Convert a row of packed 32-bit A2B10G10R10 pixels to four floats each.
 *
 * Alpha is taken from bits 0-1, blue from bits 2-11, green from bits 12-21
 * and red from bits 22-31.  The 10-bit fields are multiplied by 1/0x3ff
 * and alpha by 1/0x3.  Output order is r, g, b, a.
 *
 * \param dst_row  receives r, g, b, a floats for each pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_a2b10g10r10_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   for (unsigned remaining = width; remaining > 0; --remaining, src += 4, out += 4) {
      /* Field positions are defined on the natively loaded word. */
      const uint32_t pixel = *(const uint32_t *)src;
      const uint32_t alpha = pixel & 0x3;
      const uint32_t blue = (pixel >> 2) & 0x3ff;
      const uint32_t green = (pixel >> 12) & 0x3ff;
      const uint32_t red = pixel >> 22;

      out[0] = (float)(red * (1.0f/0x3ff));
      out[1] = (float)(green * (1.0f/0x3ff));
      out[2] = (float)(blue * (1.0f/0x3ff));
      out[3] = (float)(alpha * (1.0f/0x3));
   }
}

/**
 * Pack rows of RGBA float pixels into 32-bit A2B10G10R10 words.
 *
 * Each input channel goes through CLAMP(x, 0.0f, 1.0f), is multiplied by
 * 0x3ff (0x3 for alpha) and converted with util_iround().  Alpha lands at
 * bit 0, blue at bit 2, green at bit 12 and red at bit 22.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  bytes added to the destination pointer per row
 * \param src_row     first source row, 4 floats per pixel
 * \param src_stride  source row pitch in bytes (divided by sizeof(float))
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_a2b10g10r10_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 4) {
         /* Red occupies the top field, so it is shifted without a mask. */
         const uint32_t red = (uint32_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x3ff);
         const uint32_t green = (uint32_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x3ff) & 0x3ff;
         const uint32_t blue = (uint32_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3ff) & 0x3ff;
         const uint32_t alpha = (uint32_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0x3) & 0x3;

         *(uint32_t *)out = alpha | (blue << 2) | (green << 12) | (red << 22);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Decode a single 32-bit A2B10G10R10 texel into four floats.
 *
 * Alpha is taken from bits 0-1, blue from bits 2-11, green from bits 12-21
 * and red from bits 22-31.  The 10-bit fields are multiplied by 1/0x3ff
 * and alpha by 1/0x3.  Output order is r, g, b, a.
 *
 * \param in_dst  receives r, g, b, a as floats
 * \param src     points at the packed texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_a2b10g10r10_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   /* Field positions are defined on the natively loaded word. */
   const uint32_t texel = *(const uint32_t *)src;
   const uint32_t alpha = texel & 0x3;
   const uint32_t blue = (texel >> 2) & 0x3ff;
   const uint32_t green = (texel >> 12) & 0x3ff;
   const uint32_t red = texel >> 22;

   rgba[0] = (float)(red * (1.0f/0x3ff));
   rgba[1] = (float)(green * (1.0f/0x3ff));
   rgba[2] = (float)(blue * (1.0f/0x3ff));
   rgba[3] = (float)(alpha * (1.0f/0x3));
}

/**
 * Expand a row of packed 32-bit A2B10G10R10 pixels into four bytes per pixel.
 *
 * Alpha is taken from bits 0-1, blue from bits 2-11, green from bits 12-21
 * and red from bits 22-31.  The colour fields are converted with
 * _mesa_unorm_to_unorm(x, 10, 8) and alpha with _mesa_unorm_to_unorm(x, 2, 8).
 * Output order is r, g, b, a.
 *
 * \param dst_row  receives r, g, b, a bytes for each pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_a2b10g10r10_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned remaining = width; remaining > 0; --remaining, src += 4, out += 4) {
      /* Field positions are defined on the natively loaded word. */
      const uint32_t pixel = *(const uint32_t *)src;
      const uint32_t alpha = pixel & 0x3;
      const uint32_t blue = (pixel >> 2) & 0x3ff;
      const uint32_t green = (pixel >> 12) & 0x3ff;
      const uint32_t red = pixel >> 22;

      out[0] = _mesa_unorm_to_unorm(red, 10, 8);
      out[1] = _mesa_unorm_to_unorm(green, 10, 8);
      out[2] = _mesa_unorm_to_unorm(blue, 10, 8);
      out[3] = _mesa_unorm_to_unorm(alpha, 2, 8);
   }
}

/**
 * Pack rows of 4-byte RGBA pixels into 32-bit A2B10G10R10 words.
 *
 * The colour bytes are converted with _mesa_unorm_to_unorm(x, 8, 10) and
 * alpha with _mesa_unorm_to_unorm(x, 8, 2).  Alpha lands at bit 0, blue at
 * bit 2, green at bit 12 and red at bit 22.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  bytes added to the destination pointer per row
 * \param src_row     first source row, 4 bytes per pixel
 * \param src_stride  bytes added to the source pointer per row
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_a2b10g10r10_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 4) {
         /* Red occupies the top field, so it is shifted without a mask. */
         const uint32_t red = _mesa_unorm_to_unorm(in[0], 8, 10);
         const uint32_t green = _mesa_unorm_to_unorm(in[1], 8, 10) & 0x3ff;
         const uint32_t blue = _mesa_unorm_to_unorm(in[2], 8, 10) & 0x3ff;
         const uint32_t alpha = _mesa_unorm_to_unorm(in[3], 8, 2) & 0x3;

         *(uint32_t *)out = alpha | (blue << 2) | (green << 12) | (red << 22);
      }

      dst_row += dst_stride;
      /* Source elements are single bytes, so the byte stride is the element stride. */
      src_row += src_stride;
   }
}

/**
 * Convert a row of packed 8-bit R3G3B2 pixels to four floats each.
 *
 * Red is taken from bits 0-2, green from bits 3-5 and blue from bits 6-7
 * of each source byte.  Red and green are multiplied by 1/0x7, blue by
 * 1/0x3, and alpha is written as 1.
 *
 * \param dst_row  receives r, g, b, a floats for each pixel
 * \param src      packed source pixels, 1 byte each
 * \param width    number of pixels to convert
 */
void
util_format_r3g3b2_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   for (unsigned remaining = width; remaining > 0; --remaining, src += 1, out += 4) {
      const uint8_t pixel = *(const uint8_t *)src;
      const uint8_t red = pixel & 0x7;
      const uint8_t green = (pixel >> 3) & 0x7;
      const uint8_t blue = pixel >> 6;

      out[0] = (float)(red * (1.0f/0x7));
      out[1] = (float)(green * (1.0f/0x7));
      out[2] = (float)(blue * (1.0f/0x3));
      out[3] = 1;
   }
}

/**
 * Pack rows of RGBA float pixels into 8-bit R3G3B2 bytes.
 *
 * The r, g and b inputs go through CLAMP(x, 0.0f, 1.0f), are multiplied by
 * 0x7, 0x7 and 0x3 respectively and converted with util_iround().  Red
 * lands at bit 0, green at bit 3 and blue at bit 6.  The source alpha is
 * not read.
 *
 * \param dst_row     first destination row, 1 byte per pixel
 * \param dst_stride  bytes added to the destination pointer per row
 * \param src_row     first source row, 4 floats per pixel
 * \param src_stride  source row pitch in bytes (divided by sizeof(float))
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r3g3b2_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 1) {
         const uint32_t red = (uint8_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x7) & 0x7;
         const uint32_t green = (uint8_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x7) & 0x7;
         /* Blue occupies the top field, so it is shifted without a mask. */
         const uint32_t blue = (uint8_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3);

         *(uint8_t *)out = (uint8_t)(red | (green << 3) | (blue << 6));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Decode the single R3G3B2 pixel at src into four floats at in_dst.
 *
 * Red is bits 0-2, green bits 3-5, blue bits 6-7; each is scaled by the
 * reciprocal of its maximum and alpha is written as 1.  The i and j
 * arguments are not used.
 */
void
util_format_r3g3b2_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint8_t pixel = src[0];
   const uint8_t red = pixel & 0x7;
   const uint8_t green = (pixel >> 3) & 0x7;
   const uint8_t blue = pixel >> 6;
   rgba[0] = (float)(red * (1.0f/0x7));
   rgba[1] = (float)(green * (1.0f/0x7));
   rgba[2] = (float)(blue * (1.0f/0x3));
   rgba[3] = 1;
}

/**
 * Expand a row of one-byte R3G3B2 pixels into 8-bit RGBA.
 *
 * Red (bits 0-2) and green (bits 3-5) are converted with
 * _mesa_unorm_to_unorm(x, 3, 8), blue (bits 6-7) with
 * _mesa_unorm_to_unorm(x, 2, 8); alpha is written as 255.
 *
 * dst_row  receives 4 bytes per pixel
 * src      1 byte per pixel
 * width    number of pixels to convert
 */
void
util_format_r3g3b2_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 1, rgba += 4) {
      const uint8_t pixel = *src;
      const uint8_t red = pixel & 0x7;
      const uint8_t green = (pixel >> 3) & 0x7;
      const uint8_t blue = pixel >> 6;
      rgba[0] = _mesa_unorm_to_unorm(red, 3, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 3, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 2, 8);
      rgba[3] = 255;
   }
}

/**
 * Pack rows of 8-bit RGBA into one-byte R3G3B2 pixels.
 *
 * Red and green are converted with _mesa_unorm_to_unorm(x, 8, 3) and blue
 * with _mesa_unorm_to_unorm(x, 8, 2); red lands in bits 0-2, green in
 * bits 3-5 and blue in bits 6-7.  Alpha (src[3]) is ignored.
 *
 * Both strides are applied in bytes.
 */
void
util_format_r3g3b2_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 1) {
         uint8_t packed = _mesa_unorm_to_unorm(rgba[0], 8, 3) & 0x7;
         packed |= (uint32_t)(_mesa_unorm_to_unorm(rgba[1], 8, 3) & 0x7) << 3;
         packed |= (uint32_t)_mesa_unorm_to_unorm(rgba[2], 8, 2) << 6;
         *out = packed;
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Expand a row of one-byte B2G3R3 pixels into RGBA floats.
 *
 * Each source byte holds blue in bits 0-1, green in bits 2-4 and red in
 * bits 5-7.  Every channel is scaled by the reciprocal of its maximum
 * (3, 7, 7); alpha is not stored and is written as 1.  A one-byte pixel
 * has no byte-order dependence, so a single code path is used.
 *
 * dst_row  receives 4 floats per pixel
 * src      1 byte per pixel
 * width    number of pixels to convert
 */
void
util_format_b2g3r3_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 1, rgba += 4) {
      const uint8_t pixel = *src;
      const uint8_t blue = pixel & 0x3;
      const uint8_t green = (pixel >> 2) & 0x7;
      const uint8_t red = pixel >> 5;
      rgba[0] = (float)(red * (1.0f/0x7));
      rgba[1] = (float)(green * (1.0f/0x7));
      rgba[2] = (float)(blue * (1.0f/0x3));
      rgba[3] = 1;
   }
}

/**
 * Pack rows of RGBA floats into one-byte B2G3R3 pixels.
 *
 * Red, green and blue are each clamped to [0, 1], scaled by the channel
 * maximum (7, 7, 3) and rounded with util_iround(); blue lands in
 * bits 0-1, green in bits 2-4 and red in bits 5-7.  Alpha (src[3]) is
 * ignored.  A one-byte pixel has no byte-order dependence, so a single
 * code path is used.
 *
 * dst_stride is applied in bytes; src_stride is given in bytes and is
 * divided by sizeof(float) to step the float pointer.
 */
void
util_format_b2g3r3_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 1) {
         const uint8_t red = (uint8_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x7);
         const uint8_t green = (uint8_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x7);
         const uint8_t blue = (uint8_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x3);
         uint8_t packed = blue & 0x3;
         packed |= (uint32_t)(green & 0x7) << 2;
         packed |= (uint32_t)red << 5;
         *out = packed;
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Decode the single B2G3R3 pixel at src into four floats at in_dst.
 *
 * Blue is bits 0-1, green bits 2-4, red bits 5-7; each is scaled by the
 * reciprocal of its maximum and alpha is written as 1.  The i and j
 * arguments are not used.
 */
void
util_format_b2g3r3_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint8_t pixel = src[0];
   const uint8_t blue = pixel & 0x3;
   const uint8_t green = (pixel >> 2) & 0x7;
   const uint8_t red = pixel >> 5;
   rgba[0] = (float)(red * (1.0f/0x7));
   rgba[1] = (float)(green * (1.0f/0x7));
   rgba[2] = (float)(blue * (1.0f/0x3));
   rgba[3] = 1;
}

/**
 * Expand a row of one-byte B2G3R3 pixels into 8-bit RGBA.
 *
 * Red (bits 5-7) and green (bits 2-4) are converted with
 * _mesa_unorm_to_unorm(x, 3, 8), blue (bits 0-1) with
 * _mesa_unorm_to_unorm(x, 2, 8); alpha is written as 255.
 *
 * dst_row  receives 4 bytes per pixel
 * src      1 byte per pixel
 * width    number of pixels to convert
 */
void
util_format_b2g3r3_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 1, rgba += 4) {
      const uint8_t pixel = *src;
      const uint8_t blue = pixel & 0x3;
      const uint8_t green = (pixel >> 2) & 0x7;
      const uint8_t red = pixel >> 5;
      rgba[0] = _mesa_unorm_to_unorm(red, 3, 8);
      rgba[1] = _mesa_unorm_to_unorm(green, 3, 8);
      rgba[2] = _mesa_unorm_to_unorm(blue, 2, 8);
      rgba[3] = 255;
   }
}

/**
 * Pack rows of 8-bit RGBA into one-byte B2G3R3 pixels.
 *
 * Red and green are converted with _mesa_unorm_to_unorm(x, 8, 3) and blue
 * with _mesa_unorm_to_unorm(x, 8, 2); blue lands in bits 0-1, green in
 * bits 2-4 and red in bits 5-7.  Alpha (src[3]) is ignored.
 *
 * Both strides are applied in bytes.
 */
void
util_format_b2g3r3_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 1) {
         uint8_t packed = _mesa_unorm_to_unorm(rgba[2], 8, 2) & 0x3;
         packed |= (uint32_t)(_mesa_unorm_to_unorm(rgba[1], 8, 3) & 0x7) << 2;
         packed |= (uint32_t)_mesa_unorm_to_unorm(rgba[0], 8, 3) << 5;
         *out = packed;
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Expand a row of one-byte luminance pixels into RGBA floats.
 *
 * The source byte is converted with ubyte_to_float() and replicated to
 * red, green and blue; alpha is written as 1.
 *
 * dst_row  receives 4 floats per pixel
 * src      1 byte per pixel
 * width    number of pixels to convert
 */
void
util_format_l8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 1, rgba += 4) {
      const float lum = ubyte_to_float(*src);
      rgba[0] = lum;
      rgba[1] = lum;
      rgba[2] = lum;
      rgba[3] = 1;
   }
}

/**
 * Pack rows of RGBA floats into one-byte luminance pixels.
 *
 * Only the red channel (src[0]) is stored, converted with
 * float_to_ubyte(); green, blue and alpha are ignored.
 *
 * dst_stride is applied in bytes; src_stride is given in bytes and is
 * divided by sizeof(float) to step the float pointer.
 */
void
util_format_l8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 1) {
         *out = float_to_ubyte(rgba[0]);
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Decode the single luminance byte at src into four floats at in_dst.
 *
 * The byte is converted with ubyte_to_float() and replicated to red, green
 * and blue; alpha is written as 1.  The i and j arguments are not used.
 */
void
util_format_l8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const float lum = ubyte_to_float(src[0]);
   rgba[0] = lum;
   rgba[1] = lum;
   rgba[2] = lum;
   rgba[3] = 1;
}

/**
 * Expand a row of one-byte luminance pixels into 8-bit RGBA.
 *
 * The source byte is copied unchanged to red, green and blue; alpha is
 * written as 255.
 *
 * dst_row  receives 4 bytes per pixel
 * src      1 byte per pixel
 * width    number of pixels to convert
 */
void
util_format_l8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 1, rgba += 4) {
      const uint8_t lum = *src;
      rgba[0] = lum;
      rgba[1] = lum;
      rgba[2] = lum;
      rgba[3] = 255;
   }
}

/**
 * Pack rows of 8-bit RGBA into one-byte luminance pixels.
 *
 * Only the red byte (src[0]) of each pixel is stored; green, blue and
 * alpha are ignored.  Both strides are applied in bytes.
 */
void
util_format_l8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 1) {
         *out = rgba[0];
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Expand a row of one-byte alpha pixels into RGBA floats.
 *
 * Red, green and blue are written as 0; alpha is the source byte converted
 * with ubyte_to_float().
 *
 * dst_row  receives 4 floats per pixel
 * src      1 byte per pixel
 * width    number of pixels to convert
 */
void
util_format_a8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 1, rgba += 4) {
      rgba[0] = 0;
      rgba[1] = 0;
      rgba[2] = 0;
      rgba[3] = ubyte_to_float(*src);
   }
}

/**
 * Pack rows of RGBA floats into one-byte alpha pixels.
 *
 * Only the alpha channel (src[3]) is stored, converted with
 * float_to_ubyte(); red, green and blue are ignored.
 *
 * dst_stride is applied in bytes; src_stride is given in bytes and is
 * divided by sizeof(float) to step the float pointer.
 */
void
util_format_a8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 1) {
         *out = float_to_ubyte(rgba[3]);
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Decode the single alpha byte at src into four floats at in_dst.
 *
 * Red, green and blue are written as 0; alpha is the byte converted with
 * ubyte_to_float().  The i and j arguments are not used.
 */
void
util_format_a8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   rgba[0] = 0;
   rgba[1] = 0;
   rgba[2] = 0;
   rgba[3] = ubyte_to_float(src[0]);
}

/**
 * Expand a row of one-byte alpha pixels into 8-bit RGBA.
 *
 * Red, green and blue are written as 0; the source byte is copied
 * unchanged to alpha.
 *
 * dst_row  receives 4 bytes per pixel
 * src      1 byte per pixel
 * width    number of pixels to convert
 */
void
util_format_a8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 1, rgba += 4) {
      rgba[0] = 0;
      rgba[1] = 0;
      rgba[2] = 0;
      rgba[3] = *src;
   }
}

/**
 * Pack rows of 8-bit RGBA into one-byte alpha pixels.
 *
 * Only the alpha byte (src[3]) of each pixel is stored; red, green and
 * blue are ignored.  Both strides are applied in bytes.
 */
void
util_format_a8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 1) {
         *out = rgba[3];
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Expand a row of one-byte intensity pixels into RGBA floats.
 *
 * The source byte is converted with ubyte_to_float() and replicated to all
 * four output channels, alpha included.
 *
 * dst_row  receives 4 floats per pixel
 * src      1 byte per pixel
 * width    number of pixels to convert
 */
void
util_format_i8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 1, rgba += 4) {
      const float intensity = ubyte_to_float(*src);
      rgba[0] = intensity;
      rgba[1] = intensity;
      rgba[2] = intensity;
      rgba[3] = intensity;
   }
}

/**
 * Pack rows of RGBA floats into one-byte intensity pixels.
 *
 * Only the red channel (src[0]) is stored, converted with
 * float_to_ubyte(); green, blue and alpha are ignored.
 *
 * dst_stride is applied in bytes; src_stride is given in bytes and is
 * divided by sizeof(float) to step the float pointer.
 */
void
util_format_i8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 1) {
         *out = float_to_ubyte(rgba[0]);
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Decode the single intensity byte at src into four floats at in_dst.
 *
 * The byte is converted with ubyte_to_float() and replicated to red,
 * green, blue and alpha.  The i and j arguments are not used.
 */
void
util_format_i8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const float intensity = ubyte_to_float(src[0]);
   rgba[0] = intensity;
   rgba[1] = intensity;
   rgba[2] = intensity;
   rgba[3] = intensity;
}

/**
 * Expand a row of one-byte intensity pixels into 8-bit RGBA.
 *
 * The source byte is copied unchanged to all four output channels.
 *
 * dst_row  receives 4 bytes per pixel
 * src      1 byte per pixel
 * width    number of pixels to convert
 */
void
util_format_i8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 1, rgba += 4) {
      const uint8_t intensity = *src;
      rgba[0] = intensity;
      rgba[1] = intensity;
      rgba[2] = intensity;
      rgba[3] = intensity;
   }
}

/**
 * Pack rows of 8-bit RGBA into one-byte intensity pixels.
 *
 * Only the red byte (src[0]) of each pixel is stored; green, blue and
 * alpha are ignored.  Both strides are applied in bytes.
 */
void
util_format_i8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 1) {
         *out = rgba[0];
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Expand a row of one-byte L4A4 pixels into RGBA floats.
 *
 * Luminance is the low nibble and alpha the high nibble of each byte; both
 * are scaled by 1/15.  Luminance is replicated to red, green and blue.
 * A one-byte pixel has no byte-order dependence, so a single code path is
 * used.
 *
 * dst_row  receives 4 floats per pixel
 * src      1 byte per pixel
 * width    number of pixels to convert
 */
void
util_format_l4a4_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 1, rgba += 4) {
      const uint8_t pixel = *src;
      const uint8_t lum = pixel & 0xf;
      const uint8_t alpha = pixel >> 4;
      rgba[0] = (float)(lum * (1.0f/0xf));
      rgba[1] = (float)(lum * (1.0f/0xf));
      rgba[2] = (float)(lum * (1.0f/0xf));
      rgba[3] = (float)(alpha * (1.0f/0xf));
   }
}

/**
 * Pack rows of RGBA floats into one-byte L4A4 pixels.
 *
 * Red (src[0]) supplies luminance and src[3] supplies alpha; each is
 * clamped to [0, 1], scaled by 15 and rounded with util_iround().
 * Luminance lands in the low nibble and alpha in the high nibble; green
 * and blue are ignored.  A one-byte pixel has no byte-order dependence, so
 * a single code path is used.
 *
 * dst_stride is applied in bytes; src_stride is given in bytes and is
 * divided by sizeof(float) to step the float pointer.
 */
void
util_format_l4a4_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 1) {
         const uint8_t lum = (uint8_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0xf);
         const uint8_t alpha = (uint8_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0xf);
         uint8_t packed = lum & 0xf;
         packed |= (uint32_t)alpha << 4;
         *out = packed;
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Decode the single L4A4 pixel at src into four floats at in_dst.
 *
 * Luminance (low nibble) is scaled by 1/15 and replicated to red, green
 * and blue; alpha (high nibble) is scaled by 1/15.  The i and j arguments
 * are not used.
 */
void
util_format_l4a4_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint8_t pixel = src[0];
   const uint8_t lum = pixel & 0xf;
   const uint8_t alpha = pixel >> 4;
   rgba[0] = (float)(lum * (1.0f/0xf));
   rgba[1] = (float)(lum * (1.0f/0xf));
   rgba[2] = (float)(lum * (1.0f/0xf));
   rgba[3] = (float)(alpha * (1.0f/0xf));
}

/**
 * Expand a row of one-byte L4A4 pixels into 8-bit RGBA.
 *
 * Luminance (low nibble) and alpha (high nibble) are each converted with
 * _mesa_unorm_to_unorm(x, 4, 8); luminance is replicated to red, green and
 * blue.
 *
 * dst_row  receives 4 bytes per pixel
 * src      1 byte per pixel
 * width    number of pixels to convert
 */
void
util_format_l4a4_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 1, rgba += 4) {
      const uint8_t pixel = *src;
      const uint8_t lum = pixel & 0xf;
      const uint8_t alpha = pixel >> 4;
      rgba[0] = _mesa_unorm_to_unorm(lum, 4, 8);
      rgba[1] = _mesa_unorm_to_unorm(lum, 4, 8);
      rgba[2] = _mesa_unorm_to_unorm(lum, 4, 8);
      rgba[3] = _mesa_unorm_to_unorm(alpha, 4, 8);
   }
}

/**
 * Pack rows of 8-bit RGBA into one-byte L4A4 pixels.
 *
 * Red (src[0]) supplies luminance and src[3] supplies alpha, each
 * converted with _mesa_unorm_to_unorm(x, 8, 4).  Luminance lands in the
 * low nibble and alpha in the high nibble; green and blue are ignored.
 *
 * Both strides are applied in bytes.
 */
void
util_format_l4a4_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 1) {
         uint8_t packed = _mesa_unorm_to_unorm(rgba[0], 8, 4) & 0xf;
         packed |= (uint32_t)_mesa_unorm_to_unorm(rgba[3], 8, 4) << 4;
         *out = packed;
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Expand a row of two-byte L8A8 pixels into RGBA floats.
 *
 * In memory each pixel is a luminance byte followed by an alpha byte,
 * independent of host byte order.  The bytes are therefore read
 * individually instead of through a uint16_t load, which also removes any
 * 2-byte alignment requirement on src.  Both are converted with
 * ubyte_to_float(); luminance is replicated to red, green and blue.
 *
 * dst_row  receives 4 floats per pixel
 * src      2 bytes per pixel
 * width    number of pixels to convert
 */
void
util_format_l8a8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const float lum = ubyte_to_float(src[0]);
      rgba[0] = lum;
      rgba[1] = lum;
      rgba[2] = lum;
      rgba[3] = ubyte_to_float(src[1]);
   }
}

/**
 * Pack rows of RGBA floats into two-byte L8A8 pixels.
 *
 * Red (src[0]) supplies luminance and src[3] supplies alpha, each
 * converted with float_to_ubyte(); green and blue are ignored.  The
 * luminance byte is stored first and the alpha byte second on every host,
 * so the bytes are written individually instead of through a uint16_t
 * store; this also removes any 2-byte alignment requirement on dst_row.
 *
 * dst_stride is applied in bytes; src_stride is given in bytes and is
 * divided by sizeof(float) to step the float pointer.
 */
void
util_format_l8a8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         out[0] = float_to_ubyte(rgba[0]);
         out[1] = float_to_ubyte(rgba[3]);
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Decode the single L8A8 pixel at src into four floats at in_dst.
 *
 * The pixel is a luminance byte followed by an alpha byte regardless of
 * host byte order, so the two bytes are read individually; src then needs
 * no 2-byte alignment.  Both are converted with ubyte_to_float() and
 * luminance is replicated to red, green and blue.  The i and j arguments
 * are not used.
 */
void
util_format_l8a8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const float lum = ubyte_to_float(src[0]);
   rgba[0] = lum;
   rgba[1] = lum;
   rgba[2] = lum;
   rgba[3] = ubyte_to_float(src[1]);
}

/**
 * Expand a row of two-byte L8A8 pixels into 8-bit RGBA.
 *
 * In memory each pixel is a luminance byte followed by an alpha byte,
 * independent of host byte order.  The bytes are therefore read
 * individually instead of through a uint16_t load, which also removes any
 * 2-byte alignment requirement on src.  Luminance is replicated to red,
 * green and blue; alpha is copied unchanged.
 *
 * dst_row  receives 4 bytes per pixel
 * src      2 bytes per pixel
 * width    number of pixels to convert
 */
void
util_format_l8a8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint8_t lum = src[0];
      rgba[0] = lum;
      rgba[1] = lum;
      rgba[2] = lum;
      rgba[3] = src[1];
   }
}

/**
 * Pack rows of 8-bit RGBA into two-byte L8A8 pixels.
 *
 * Red (src[0]) supplies luminance and src[3] supplies alpha; green and
 * blue are ignored.  The luminance byte is stored first and the alpha byte
 * second on every host, so the bytes are written individually instead of
 * through a uint16_t store; this also removes any 2-byte alignment
 * requirement on dst_row.
 *
 * Both strides are applied in bytes.
 */
void
util_format_l8a8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         out[0] = rgba[0];
         out[1] = rgba[3];
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Expand a row of 16-bit luminance pixels into RGBA floats.
 *
 * Each pixel is loaded as a native uint16_t, scaled by 1/65535 and
 * replicated to red, green and blue; alpha is written as 1.
 *
 * NOTE(review): src is dereferenced through a uint16_t pointer, so it
 * presumably must be 2-byte aligned -- confirm against callers.
 *
 * dst_row  receives 4 floats per pixel
 * src      2 bytes per pixel
 * width    number of pixels to convert
 */
void
util_format_l16_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t lum = *(const uint16_t *)src;
      const float scaled = (float)(lum * (1.0f/0xffff));
      rgba[0] = scaled;
      rgba[1] = scaled;
      rgba[2] = scaled;
      rgba[3] = 1;
   }
}

/**
 * Pack rows of RGBA floats into 16-bit luminance pixels.
 *
 * Only the red channel (src[0]) is stored: it is clamped to [0, 1], scaled
 * by 65535, rounded with util_iround() and written as a native uint16_t.
 *
 * NOTE(review): each pixel is written through a uint16_t pointer, so
 * dst_row and dst_stride presumably must keep 2-byte alignment -- confirm
 * against callers.
 *
 * dst_stride is applied in bytes; src_stride is given in bytes and is
 * divided by sizeof(float) to step the float pointer.
 */
void
util_format_l16_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         *(uint16_t *)out = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0xffff);
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Decode the single 16-bit luminance pixel at src into four floats.
 *
 * The native uint16_t value is scaled by 1/65535 and replicated to red,
 * green and blue; alpha is written as 1.  The i and j arguments are not
 * used.
 *
 * NOTE(review): src is dereferenced through a uint16_t pointer, so it
 * presumably must be 2-byte aligned -- confirm against callers.
 */
void
util_format_l16_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t lum = *(const uint16_t *)src;
   const float scaled = (float)(lum * (1.0f/0xffff));
   rgba[0] = scaled;
   rgba[1] = scaled;
   rgba[2] = scaled;
   rgba[3] = 1;
}

/**
 * Expand a row of 16-bit luminance pixels into 8-bit RGBA.
 *
 * Each native uint16_t is converted with _mesa_unorm_to_unorm(x, 16, 8)
 * and replicated to red, green and blue; alpha is written as 255.
 *
 * NOTE(review): src is dereferenced through a uint16_t pointer, so it
 * presumably must be 2-byte aligned -- confirm against callers.
 *
 * dst_row  receives 4 bytes per pixel
 * src      2 bytes per pixel
 * width    number of pixels to convert
 */
void
util_format_l16_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t lum = *(const uint16_t *)src;
      rgba[0] = _mesa_unorm_to_unorm(lum, 16, 8);
      rgba[1] = _mesa_unorm_to_unorm(lum, 16, 8);
      rgba[2] = _mesa_unorm_to_unorm(lum, 16, 8);
      rgba[3] = 255;
   }
}

/**
 * Pack rows of 8-bit RGBA into 16-bit luminance pixels.
 *
 * Only the red byte (src[0]) is stored, converted with
 * _mesa_unorm_to_unorm(x, 8, 16) and written as a native uint16_t.
 *
 * NOTE(review): each pixel is written through a uint16_t pointer, so
 * dst_row and dst_stride presumably must keep 2-byte alignment -- confirm
 * against callers.
 *
 * Both strides are applied in bytes.
 */
void
util_format_l16_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         *(uint16_t *)out = (uint16_t)_mesa_unorm_to_unorm(rgba[0], 8, 16);
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Expand a row of 16-bit alpha pixels into RGBA floats.
 *
 * Red, green and blue are written as 0; alpha is the native uint16_t
 * value scaled by 1/65535.
 *
 * NOTE(review): src is dereferenced through a uint16_t pointer, so it
 * presumably must be 2-byte aligned -- confirm against callers.
 *
 * dst_row  receives 4 floats per pixel
 * src      2 bytes per pixel
 * width    number of pixels to convert
 */
void
util_format_a16_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t alpha = *(const uint16_t *)src;
      rgba[0] = 0;
      rgba[1] = 0;
      rgba[2] = 0;
      rgba[3] = (float)(alpha * (1.0f/0xffff));
   }
}

/**
 * Pack rows of RGBA floats into 16-bit alpha pixels.
 *
 * Only the alpha channel (src[3]) is stored: it is clamped to [0, 1],
 * scaled by 65535, rounded with util_iround() and written as a native
 * uint16_t.
 *
 * NOTE(review): each pixel is written through a uint16_t pointer, so
 * dst_row and dst_stride presumably must keep 2-byte alignment -- confirm
 * against callers.
 *
 * dst_stride is applied in bytes; src_stride is given in bytes and is
 * divided by sizeof(float) to step the float pointer.
 */
void
util_format_a16_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         *(uint16_t *)out = (uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0xffff);
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Decode the single 16-bit alpha pixel at src into four floats.
 *
 * Red, green and blue are written as 0; alpha is the native uint16_t
 * value scaled by 1/65535.  The i and j arguments are not used.
 *
 * NOTE(review): src is dereferenced through a uint16_t pointer, so it
 * presumably must be 2-byte aligned -- confirm against callers.
 */
void
util_format_a16_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t alpha = *(const uint16_t *)src;
   rgba[0] = 0;
   rgba[1] = 0;
   rgba[2] = 0;
   rgba[3] = (float)(alpha * (1.0f/0xffff));
}

/**
 * Expand a row of 16-bit alpha pixels into 8-bit RGBA.
 *
 * Red, green and blue are written as 0; alpha is the native uint16_t
 * value converted with _mesa_unorm_to_unorm(x, 16, 8).
 *
 * NOTE(review): src is dereferenced through a uint16_t pointer, so it
 * presumably must be 2-byte aligned -- confirm against callers.
 *
 * dst_row  receives 4 bytes per pixel
 * src      2 bytes per pixel
 * width    number of pixels to convert
 */
void
util_format_a16_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t alpha = *(const uint16_t *)src;
      rgba[0] = 0;
      rgba[1] = 0;
      rgba[2] = 0;
      rgba[3] = _mesa_unorm_to_unorm(alpha, 16, 8);
   }
}

/**
 * Pack rows of 8-bit RGBA into 16-bit alpha pixels.
 *
 * Only the alpha byte (src[3]) is stored, converted with
 * _mesa_unorm_to_unorm(x, 8, 16) and written as a native uint16_t.
 *
 * NOTE(review): each pixel is written through a uint16_t pointer, so
 * dst_row and dst_stride presumably must keep 2-byte alignment -- confirm
 * against callers.
 *
 * Both strides are applied in bytes.
 */
void
util_format_a16_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         *(uint16_t *)out = (uint16_t)_mesa_unorm_to_unorm(rgba[3], 8, 16);
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Expand a row of 16-bit intensity pixels into RGBA floats.
 *
 * Each native uint16_t is scaled by 1/65535 and replicated to all four
 * output channels, alpha included.
 *
 * NOTE(review): src is dereferenced through a uint16_t pointer, so it
 * presumably must be 2-byte aligned -- confirm against callers.
 *
 * dst_row  receives 4 floats per pixel
 * src      2 bytes per pixel
 * width    number of pixels to convert
 */
void
util_format_i16_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t intensity = *(const uint16_t *)src;
      const float scaled = (float)(intensity * (1.0f/0xffff));
      rgba[0] = scaled;
      rgba[1] = scaled;
      rgba[2] = scaled;
      rgba[3] = scaled;
   }
}

/**
 * Pack rows of RGBA floats into 16-bit intensity pixels.
 *
 * Only the red channel (src[0]) is stored: it is clamped to [0, 1], scaled
 * by 65535, rounded with util_iround() and written as a native uint16_t.
 *
 * NOTE(review): each pixel is written through a uint16_t pointer, so
 * dst_row and dst_stride presumably must keep 2-byte alignment -- confirm
 * against callers.
 *
 * dst_stride is applied in bytes; src_stride is given in bytes and is
 * divided by sizeof(float) to step the float pointer.
 */
void
util_format_i16_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         *(uint16_t *)out = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0xffff);
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Decode the single 16-bit intensity pixel at src into four floats.
 *
 * The native uint16_t value is scaled by 1/65535 and replicated to red,
 * green, blue and alpha.  The i and j arguments are not used.
 *
 * NOTE(review): src is dereferenced through a uint16_t pointer, so it
 * presumably must be 2-byte aligned -- confirm against callers.
 */
void
util_format_i16_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint16_t intensity = *(const uint16_t *)src;
   const float scaled = (float)(intensity * (1.0f/0xffff));
   rgba[0] = scaled;
   rgba[1] = scaled;
   rgba[2] = scaled;
   rgba[3] = scaled;
}

/**
 * Expand a row of 16-bit intensity pixels into 8-bit RGBA.
 *
 * Each native uint16_t is converted with _mesa_unorm_to_unorm(x, 16, 8)
 * and replicated to all four output channels.
 *
 * NOTE(review): src is dereferenced through a uint16_t pointer, so it
 * presumably must be 2-byte aligned -- confirm against callers.
 *
 * dst_row  receives 4 bytes per pixel
 * src      2 bytes per pixel
 * width    number of pixels to convert
 */
void
util_format_i16_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 2, rgba += 4) {
      const uint16_t intensity = *(const uint16_t *)src;
      rgba[0] = _mesa_unorm_to_unorm(intensity, 16, 8);
      rgba[1] = _mesa_unorm_to_unorm(intensity, 16, 8);
      rgba[2] = _mesa_unorm_to_unorm(intensity, 16, 8);
      rgba[3] = _mesa_unorm_to_unorm(intensity, 16, 8);
   }
}

/**
 * Pack rows of 8-bit RGBA into 16-bit intensity pixels.
 *
 * Only the red byte (src[0]) is stored, converted with
 * _mesa_unorm_to_unorm(x, 8, 16) and written as a native uint16_t.
 *
 * NOTE(review): each pixel is written through a uint16_t pointer, so
 * dst_row and dst_stride presumably must keep 2-byte alignment -- confirm
 * against callers.
 *
 * Both strides are applied in bytes.
 */
void
util_format_i16_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const unsigned src_step = (unsigned)(src_stride/sizeof(*src_row));
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++, rgba += 4, out += 2) {
         *(uint16_t *)out = (uint16_t)_mesa_unorm_to_unorm(rgba[0], 8, 16);
      }
      dst_row += dst_stride;
      src_row += src_step;
   }
}

/**
 * Expand a row of 32-bit L16A16 pixels into RGBA floats.
 *
 * Each pixel is loaded as a native uint32_t.  When UTIL_ARCH_BIG_ENDIAN is
 * set, luminance is the upper 16 bits and alpha the lower 16; otherwise
 * the halves are swapped.  Both are scaled by 1/65535 and luminance is
 * replicated to red, green and blue.
 *
 * NOTE(review): src is dereferenced through a uint32_t pointer, so it
 * presumably must be 4-byte aligned -- confirm against callers.
 *
 * dst_row  receives 4 floats per pixel
 * src      4 bytes per pixel
 * width    number of pixels to convert
 */
void
util_format_l16a16_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned i = 0; i < width; i++, src += 4, rgba += 4) {
      const uint32_t pixel = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
      const uint32_t lum = pixel >> 16;
      const uint32_t alpha = pixel & 0xffff;
#else
      const uint32_t lum = pixel & 0xffff;
      const uint32_t alpha = pixel >> 16;
#endif
      const float scaled_lum = (float)(lum * (1.0f/0xffff));
      rgba[0] = scaled_lum;
      rgba[1] = scaled_lum;
      rgba[2] = scaled_lum;
      rgba[3] = (float)(alpha * (1.0f/0xffff));
   }
}

/**
 * Pack a rectangle of RGBA float pixels into L16A16_UNORM.
 *
 * The red and alpha components of each source pixel are clamped to
 * [0, 1], scaled by 0xffff and rounded; green and blue are ignored.
 * dst_stride and src_stride are in bytes.
 */
void
util_format_l16a16_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         const uint16_t lum = (uint16_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0xffff);
         const uint16_t alpha = (uint16_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0xffff);
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t packed = ((uint32_t)lum << 16) | (alpha & 0xffff);
#else
         const uint32_t packed = (lum & 0xffff) | ((uint32_t)alpha << 16);
#endif
         *(uint32_t *)out = packed;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch one L16A16_UNORM texel as RGBA float.
 *
 * The luminance half of the 32-bit texel is scaled by 1/0xffff and written
 * to r, g and b; the alpha half is scaled the same way and written to a.
 * The texel coordinates i and j are not used.
 */
void
util_format_l16a16_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba_out = in_dst;
   const uint32_t texel = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
   const uint32_t lum = texel >> 16;
   const uint32_t alpha = (texel) & 0xffff;
#else
   const uint32_t lum = (texel) & 0xffff;
   const uint32_t alpha = texel >> 16;
#endif
   const float lum_f = (float)(lum * (1.0f/0xffff));

   rgba_out[0] = lum_f; /* r */
   rgba_out[1] = lum_f; /* g */
   rgba_out[2] = lum_f; /* b */
   rgba_out[3] = (float)(alpha * (1.0f/0xffff)); /* a */
}

/**
 * Unpack a run of L16A16_UNORM pixels into 8-bit UNORM RGBA.
 *
 * Both 16-bit channels are rescaled to 8 bits; luminance is replicated to
 * r, g and b, and alpha goes to a.
 */
void
util_format_l16a16_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned px = 0; px < width; px++, src += 4, out += 4) {
      const uint32_t texel = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
      const uint32_t lum = texel >> 16;
      const uint32_t alpha = (texel) & 0xffff;
#else
      const uint32_t lum = (texel) & 0xffff;
      const uint32_t alpha = texel >> 16;
#endif
      const uint8_t lum8 = _mesa_unorm_to_unorm(lum, 16, 8);

      out[0] = lum8; /* r */
      out[1] = lum8; /* g */
      out[2] = lum8; /* b */
      out[3] = _mesa_unorm_to_unorm(alpha, 16, 8); /* a */
   }
}

/**
 * Pack a rectangle of 8-bit UNORM RGBA pixels into L16A16_UNORM.
 *
 * The red and alpha bytes of each source pixel are rescaled from 8 to 16
 * bits and combined into one 32-bit word; green and blue are ignored.
 * dst_stride and src_stride are in bytes.
 */
void
util_format_l16a16_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         const uint32_t lum = _mesa_unorm_to_unorm(in[0], 8, 16);
         const uint32_t alpha = _mesa_unorm_to_unorm(in[3], 8, 16);
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t packed = (lum << 16) | (alpha & 0xffff);
#else
         const uint32_t packed = (lum & 0xffff) | (alpha << 16);
#endif
         *(uint32_t *)out = packed;
      }

      dst_row += dst_stride;
      src_row += src_stride;
   }
}

/**
 * Unpack a run of A8_SNORM pixels into RGBA float.
 *
 * r, g and b are written as 0; the signed 8-bit source value is scaled by
 * 1/0x7f and written to a.
 *
 * The most negative code (-128) would scale to slightly below -1.0, so the
 * result is clamped to -1.0, matching the signed-normalized conversion
 * rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_a8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const int8_t a = (int8_t)*src;
      const float alpha = (float)(a * (1.0f/0x7f));

      dst[0] = 0; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = alpha < -1.0f ? -1.0f : alpha; /* a */
      src += 1;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA float pixels into A8_SNORM.
 *
 * Only the alpha component of each source pixel is used: it is clamped to
 * [-1, 1], scaled by 0x7f and rounded.  dst_stride and src_stride are in
 * bytes.
 */
void
util_format_a8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 1) {
         const int8_t alpha = (int8_t)util_iround(CLAMP(in[3], -1.0f, 1.0f) * 0x7f);
         *out = (uint8_t)alpha;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch one A8_SNORM texel as RGBA float.
 *
 * r, g and b are written as 0; the signed 8-bit value at src is scaled by
 * 1/0x7f and written to a.  The texel coordinates i and j are not used.
 *
 * The most negative code (-128) would scale to slightly below -1.0, so the
 * result is clamped to -1.0, matching the signed-normalized conversion
 * rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_a8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const int8_t a = (int8_t)*src;
   const float alpha = (float)(a * (1.0f/0x7f));

   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = alpha < -1.0f ? -1.0f : alpha; /* a */
}

/**
 * Unpack a run of A8_SNORM pixels into 8-bit UNORM RGBA.
 *
 * r, g and b are written as 0.  Negative alpha values are raised to 0
 * before the signed value is converted to an unsigned 8-bit one.
 */
void
util_format_a8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned px = 0; px < width; px++, src += 1, out += 4) {
      const int8_t alpha = (int8_t)*src;

      out[0] = 0; /* r */
      out[1] = 0; /* g */
      out[2] = 0; /* b */
      out[3] = _mesa_snorm_to_unorm(MAX2(alpha, 0), 8, 8); /* a */
   }
}

/**
 * Pack a rectangle of 8-bit UNORM RGBA pixels into A8_SNORM.
 *
 * Only the alpha byte of each source pixel is used; it is converted from
 * unsigned to signed normalized 8-bit.  dst_stride and src_stride are in
 * bytes.
 */
void
util_format_a8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 1) {
         *out = (uint8_t)(_mesa_unorm_to_snorm(in[3], 8, 8));
      }

      dst_row += dst_stride;
      src_row += src_stride;
   }
}

/**
 * Unpack a run of L8_SNORM pixels into RGBA float.
 *
 * The signed 8-bit source value is scaled by 1/0x7f and written to r, g
 * and b; a is written as 1.
 *
 * The most negative code (-128) would scale to slightly below -1.0, so the
 * result is clamped to -1.0, matching the signed-normalized conversion
 * rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_l8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const int8_t rgb = (int8_t)*src;
      const float scaled = (float)(rgb * (1.0f/0x7f));
      const float lum = scaled < -1.0f ? -1.0f : scaled;

      dst[0] = lum; /* r */
      dst[1] = lum; /* g */
      dst[2] = lum; /* b */
      dst[3] = 1; /* a */
      src += 1;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA float pixels into L8_SNORM.
 *
 * Only the red component of each source pixel is used: it is clamped to
 * [-1, 1], scaled by 0x7f and rounded.  dst_stride and src_stride are in
 * bytes.
 */
void
util_format_l8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 1) {
         const int8_t lum = (int8_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7f);
         *out = (uint8_t)lum;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch one L8_SNORM texel as RGBA float.
 *
 * The signed 8-bit value at src is scaled by 1/0x7f and written to r, g
 * and b; a is written as 1.  The texel coordinates i and j are not used.
 *
 * The most negative code (-128) would scale to slightly below -1.0, so the
 * result is clamped to -1.0, matching the signed-normalized conversion
 * rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_l8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const int8_t rgb = (int8_t)*src;
   const float scaled = (float)(rgb * (1.0f/0x7f));
   const float lum = scaled < -1.0f ? -1.0f : scaled;

   dst[0] = lum; /* r */
   dst[1] = lum; /* g */
   dst[2] = lum; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a run of L8_SNORM pixels into 8-bit UNORM RGBA.
 *
 * Negative luminance values are raised to 0 before conversion to unsigned
 * 8-bit; the result is replicated to r, g and b, and a is written as 255.
 */
void
util_format_l8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned px = 0; px < width; px++, src += 1, out += 4) {
      const int8_t lum = (int8_t)*src;
      const uint8_t lum8 = _mesa_snorm_to_unorm(MAX2(lum, 0), 8, 8);

      out[0] = lum8; /* r */
      out[1] = lum8; /* g */
      out[2] = lum8; /* b */
      out[3] = 255; /* a */
   }
}

/**
 * Pack a rectangle of 8-bit UNORM RGBA pixels into L8_SNORM.
 *
 * Only the red byte of each source pixel is used; it is converted from
 * unsigned to signed normalized 8-bit.  dst_stride and src_stride are in
 * bytes.
 */
void
util_format_l8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 1) {
         *out = (uint8_t)(_mesa_unorm_to_snorm(in[0], 8, 8));
      }

      dst_row += dst_stride;
      src_row += src_stride;
   }
}

/**
 * Unpack a run of L8A8_SNORM pixels into RGBA float.
 *
 * Each 16-bit source pixel holds two signed 8-bit channels, extracted by
 * sign-extending shifts.  Luminance is scaled by 1/0x7f and written to r,
 * g and b; alpha is scaled the same way and written to a.  Which byte
 * holds which channel depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * The most negative code (-128) would scale to slightly below -1.0, so
 * both channels are clamped to -1.0, matching the signed-normalized
 * conversion rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_l8a8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const uint16_t value = *(const uint16_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
      const int16_t rgb = ((int16_t)(value) ) >> 8;
      const int16_t a = ((int16_t)(value << 8) ) >> 8;
#else
      const int16_t rgb = ((int16_t)(value << 8) ) >> 8;
      const int16_t a = ((int16_t)(value) ) >> 8;
#endif
      const float lum_scaled = (float)(rgb * (1.0f/0x7f));
      const float alpha_scaled = (float)(a * (1.0f/0x7f));
      const float lum = lum_scaled < -1.0f ? -1.0f : lum_scaled;

      dst[0] = lum; /* r */
      dst[1] = lum; /* g */
      dst[2] = lum; /* b */
      dst[3] = alpha_scaled < -1.0f ? -1.0f : alpha_scaled; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA float pixels into L8A8_SNORM.
 *
 * The red and alpha components of each source pixel are clamped to
 * [-1, 1], scaled by 0x7f and rounded, then combined into one 16-bit
 * word; green and blue are ignored.  dst_stride and src_stride are in
 * bytes.
 */
void
util_format_l8a8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 2) {
         const int8_t lum = (int8_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7f);
         const int8_t alpha = (int8_t)util_iround(CLAMP(in[3], -1.0f, 1.0f) * 0x7f);
#if UTIL_ARCH_BIG_ENDIAN
         const uint16_t hi = (uint16_t)((uint32_t)lum << 8);
         const uint16_t lo = (uint16_t)(alpha & 0xff);
#else
         const uint16_t lo = (uint16_t)(lum & 0xff);
         const uint16_t hi = (uint16_t)((uint32_t)alpha << 8);
#endif
         *(uint16_t *)out = (uint16_t)(hi | lo);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch one L8A8_SNORM texel as RGBA float.
 *
 * The two signed 8-bit channels of the 16-bit texel are extracted by
 * sign-extending shifts.  Luminance is scaled by 1/0x7f and written to r,
 * g and b; alpha is scaled the same way and written to a.  The texel
 * coordinates i and j are not used.
 *
 * The most negative code (-128) would scale to slightly below -1.0, so
 * both channels are clamped to -1.0, matching the signed-normalized
 * conversion rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_l8a8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const uint16_t value = *(const uint16_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
   const int16_t rgb = ((int16_t)(value) ) >> 8;
   const int16_t a = ((int16_t)(value << 8) ) >> 8;
#else
   const int16_t rgb = ((int16_t)(value << 8) ) >> 8;
   const int16_t a = ((int16_t)(value) ) >> 8;
#endif
   const float lum_scaled = (float)(rgb * (1.0f/0x7f));
   const float alpha_scaled = (float)(a * (1.0f/0x7f));
   const float lum = lum_scaled < -1.0f ? -1.0f : lum_scaled;

   dst[0] = lum; /* r */
   dst[1] = lum; /* g */
   dst[2] = lum; /* b */
   dst[3] = alpha_scaled < -1.0f ? -1.0f : alpha_scaled; /* a */
}

/**
 * Unpack a run of L8A8_SNORM pixels into 8-bit UNORM RGBA.
 *
 * Both signed channels are raised to 0 when negative and then converted
 * to unsigned 8-bit; luminance is replicated to r, g and b, and alpha
 * goes to a.
 */
void
util_format_l8a8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned px = 0; px < width; px++, src += 2, out += 4) {
      const uint16_t texel = *(const uint16_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
      const int16_t lum = ((int16_t)(texel) ) >> 8;
      const int16_t alpha = ((int16_t)(texel << 8) ) >> 8;
#else
      const int16_t lum = ((int16_t)(texel << 8) ) >> 8;
      const int16_t alpha = ((int16_t)(texel) ) >> 8;
#endif
      const uint8_t lum8 = _mesa_snorm_to_unorm(MAX2(lum, 0), 8, 8);

      out[0] = lum8; /* r */
      out[1] = lum8; /* g */
      out[2] = lum8; /* b */
      out[3] = _mesa_snorm_to_unorm(MAX2(alpha, 0), 8, 8); /* a */
   }
}

/**
 * Pack a rectangle of 8-bit UNORM RGBA pixels into L8A8_SNORM.
 *
 * The red and alpha bytes of each source pixel are converted from
 * unsigned to signed normalized 8-bit and combined into one 16-bit word;
 * green and blue are ignored.  dst_stride and src_stride are in bytes.
 */
void
util_format_l8a8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 2) {
#if UTIL_ARCH_BIG_ENDIAN
         const uint16_t hi = (uint16_t)((uint32_t)(_mesa_unorm_to_snorm(in[0], 8, 8)) << 8);
         const uint16_t lo = (uint16_t)((_mesa_unorm_to_snorm(in[3], 8, 8)) & 0xff);
#else
         const uint16_t lo = (uint16_t)((_mesa_unorm_to_snorm(in[0], 8, 8)) & 0xff);
         const uint16_t hi = (uint16_t)((uint32_t)(_mesa_unorm_to_snorm(in[3], 8, 8)) << 8);
#endif
         *(uint16_t *)out = (uint16_t)(hi | lo);
      }

      dst_row += dst_stride;
      src_row += src_stride;
   }
}

/**
 * Unpack a run of I8_SNORM pixels into RGBA float.
 *
 * The signed 8-bit source value is scaled by 1/0x7f and written to all
 * four components.
 *
 * The most negative code (-128) would scale to slightly below -1.0, so the
 * result is clamped to -1.0, matching the signed-normalized conversion
 * rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_i8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const int8_t rgba = (int8_t)*src;
      const float scaled = (float)(rgba * (1.0f/0x7f));
      const float intensity = scaled < -1.0f ? -1.0f : scaled;

      dst[0] = intensity; /* r */
      dst[1] = intensity; /* g */
      dst[2] = intensity; /* b */
      dst[3] = intensity; /* a */
      src += 1;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA float pixels into I8_SNORM.
 *
 * Only the red component of each source pixel is used: it is clamped to
 * [-1, 1], scaled by 0x7f and rounded.  dst_stride and src_stride are in
 * bytes.
 */
void
util_format_i8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 1) {
         const int8_t intensity = (int8_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7f);
         *out = (uint8_t)intensity;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch one I8_SNORM texel as RGBA float.
 *
 * The signed 8-bit value at src is scaled by 1/0x7f and written to all
 * four components.  The texel coordinates i and j are not used.
 *
 * The most negative code (-128) would scale to slightly below -1.0, so the
 * result is clamped to -1.0, matching the signed-normalized conversion
 * rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_i8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const int8_t rgba = (int8_t)*src;
   const float scaled = (float)(rgba * (1.0f/0x7f));
   const float intensity = scaled < -1.0f ? -1.0f : scaled;

   dst[0] = intensity; /* r */
   dst[1] = intensity; /* g */
   dst[2] = intensity; /* b */
   dst[3] = intensity; /* a */
}

/**
 * Unpack a run of I8_SNORM pixels into 8-bit UNORM RGBA.
 *
 * Negative values are raised to 0 before conversion to unsigned 8-bit;
 * the result is replicated into all four destination bytes.
 */
void
util_format_i8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned px = 0; px < width; px++, src += 1, out += 4) {
      const int8_t texel = (int8_t)*src;
      const uint8_t intensity = _mesa_snorm_to_unorm(MAX2(texel, 0), 8, 8);

      out[0] = intensity; /* r */
      out[1] = intensity; /* g */
      out[2] = intensity; /* b */
      out[3] = intensity; /* a */
   }
}

/**
 * Pack a rectangle of 8-bit UNORM RGBA pixels into I8_SNORM.
 *
 * Only the red byte of each source pixel is used; it is converted from
 * unsigned to signed normalized 8-bit.  dst_stride and src_stride are in
 * bytes.
 */
void
util_format_i8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 1) {
         *out = (uint8_t)(_mesa_unorm_to_snorm(in[0], 8, 8));
      }

      dst_row += dst_stride;
      src_row += src_stride;
   }
}

/**
 * Unpack a run of A16_SNORM pixels into RGBA float.
 *
 * r, g and b are written as 0; the signed 16-bit source value is scaled
 * by 1/0x7fff and written to a.  src is dereferenced as uint16_t.
 *
 * The most negative code (-32768) would scale to slightly below -1.0, so
 * the result is clamped to -1.0, matching the signed-normalized
 * conversion rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_a16_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const uint16_t value = *(const uint16_t *)src;
      const int16_t a = (int16_t)(value);
      const float alpha = (float)(a * (1.0f/0x7fff));

      dst[0] = 0; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = alpha < -1.0f ? -1.0f : alpha; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA float pixels into A16_SNORM.
 *
 * Only the alpha component of each source pixel is used: it is clamped to
 * [-1, 1], scaled by 0x7fff and rounded.  dst_stride and src_stride are in
 * bytes.
 */
void
util_format_a16_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 2) {
         const int16_t alpha = (int16_t)util_iround(CLAMP(in[3], -1.0f, 1.0f) * 0x7fff);
         *(uint16_t *)out = (uint16_t)alpha;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch one A16_SNORM texel as RGBA float.
 *
 * r, g and b are written as 0; the signed 16-bit value at src is scaled
 * by 1/0x7fff and written to a.  The texel coordinates i and j are not
 * used.
 *
 * The most negative code (-32768) would scale to slightly below -1.0, so
 * the result is clamped to -1.0, matching the signed-normalized
 * conversion rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_a16_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const uint16_t value = *(const uint16_t *)src;
   const int16_t a = (int16_t)(value);
   const float alpha = (float)(a * (1.0f/0x7fff));

   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = alpha < -1.0f ? -1.0f : alpha; /* a */
}

/**
 * Unpack a run of A16_SNORM pixels into 8-bit UNORM RGBA.
 *
 * r, g and b are written as 0.  Negative alpha values are raised to 0
 * before the signed 16-bit value is converted to unsigned 8-bit.
 */
void
util_format_a16_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned px = 0; px < width; px++, src += 2, out += 4) {
      const int16_t alpha = (int16_t)(*(const uint16_t *)src);

      out[0] = 0; /* r */
      out[1] = 0; /* g */
      out[2] = 0; /* b */
      out[3] = _mesa_snorm_to_unorm(MAX2(alpha, 0), 16, 8); /* a */
   }
}

/**
 * Pack a rectangle of 8-bit UNORM RGBA pixels into A16_SNORM.
 *
 * Only the alpha byte of each source pixel is used; it is converted from
 * unsigned 8-bit to signed normalized 16-bit.  dst_stride and src_stride
 * are in bytes.
 */
void
util_format_a16_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 2) {
         *(uint16_t *)out = (uint16_t)(_mesa_unorm_to_snorm(in[3], 8, 16));
      }

      dst_row += dst_stride;
      src_row += src_stride;
   }
}

/**
 * Unpack a run of L16_SNORM pixels into RGBA float.
 *
 * The signed 16-bit source value is scaled by 1/0x7fff and written to r,
 * g and b; a is written as 1.  src is dereferenced as uint16_t.
 *
 * The most negative code (-32768) would scale to slightly below -1.0, so
 * the result is clamped to -1.0, matching the signed-normalized
 * conversion rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_l16_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const uint16_t value = *(const uint16_t *)src;
      const int16_t rgb = (int16_t)(value);
      const float scaled = (float)(rgb * (1.0f/0x7fff));
      const float lum = scaled < -1.0f ? -1.0f : scaled;

      dst[0] = lum; /* r */
      dst[1] = lum; /* g */
      dst[2] = lum; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA float pixels into L16_SNORM.
 *
 * Only the red component of each source pixel is used: it is clamped to
 * [-1, 1], scaled by 0x7fff and rounded.  dst_stride and src_stride are in
 * bytes.
 */
void
util_format_l16_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 2) {
         const int16_t lum = (int16_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7fff);
         *(uint16_t *)out = (uint16_t)lum;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch one L16_SNORM texel as RGBA float.
 *
 * The signed 16-bit value at src is scaled by 1/0x7fff and written to r,
 * g and b; a is written as 1.  The texel coordinates i and j are not
 * used.
 *
 * The most negative code (-32768) would scale to slightly below -1.0, so
 * the result is clamped to -1.0, matching the signed-normalized
 * conversion rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_l16_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const uint16_t value = *(const uint16_t *)src;
   const int16_t rgb = (int16_t)(value);
   const float scaled = (float)(rgb * (1.0f/0x7fff));
   const float lum = scaled < -1.0f ? -1.0f : scaled;

   dst[0] = lum; /* r */
   dst[1] = lum; /* g */
   dst[2] = lum; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a run of L16_SNORM pixels into 8-bit UNORM RGBA.
 *
 * Negative luminance values are raised to 0 before the signed 16-bit
 * value is converted to unsigned 8-bit; the result is replicated to r, g
 * and b, and a is written as 255.
 */
void
util_format_l16_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned px = 0; px < width; px++, src += 2, out += 4) {
      const int16_t lum = (int16_t)(*(const uint16_t *)src);
      const uint8_t lum8 = _mesa_snorm_to_unorm(MAX2(lum, 0), 16, 8);

      out[0] = lum8; /* r */
      out[1] = lum8; /* g */
      out[2] = lum8; /* b */
      out[3] = 255; /* a */
   }
}

/**
 * Pack a rectangle of 8-bit UNORM RGBA pixels into L16_SNORM.
 *
 * Only the red byte of each source pixel is used; it is converted from
 * unsigned 8-bit to signed normalized 16-bit.  dst_stride and src_stride
 * are in bytes.
 */
void
util_format_l16_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 2) {
         *(uint16_t *)out = (uint16_t)(_mesa_unorm_to_snorm(in[0], 8, 16));
      }

      dst_row += dst_stride;
      src_row += src_stride;
   }
}

/**
 * Unpack a run of L16A16_SNORM pixels into RGBA float.
 *
 * Each 32-bit source pixel holds two signed 16-bit channels, extracted by
 * sign-extending shifts.  Luminance is scaled by 1/0x7fff and written to
 * r, g and b; alpha is scaled the same way and written to a.  Which half
 * of the word holds which channel depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * The most negative code (-32768) would scale to slightly below -1.0, so
 * both channels are clamped to -1.0, matching the signed-normalized
 * conversion rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_l16a16_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const uint32_t value = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
      const int32_t rgb = ((int32_t)(value) ) >> 16;
      const int32_t a = ((int32_t)(value << 16) ) >> 16;
#else
      const int32_t rgb = ((int32_t)(value << 16) ) >> 16;
      const int32_t a = ((int32_t)(value) ) >> 16;
#endif
      const float lum_scaled = (float)(rgb * (1.0f/0x7fff));
      const float alpha_scaled = (float)(a * (1.0f/0x7fff));
      const float lum = lum_scaled < -1.0f ? -1.0f : lum_scaled;

      dst[0] = lum; /* r */
      dst[1] = lum; /* g */
      dst[2] = lum; /* b */
      dst[3] = alpha_scaled < -1.0f ? -1.0f : alpha_scaled; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA float pixels into L16A16_SNORM.
 *
 * The red and alpha components of each source pixel are clamped to
 * [-1, 1], scaled by 0x7fff and rounded, then combined into one 32-bit
 * word; green and blue are ignored.  dst_stride and src_stride are in
 * bytes.
 */
void
util_format_l16a16_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         const int16_t lum = (int16_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7fff);
         const int16_t alpha = (int16_t)util_iround(CLAMP(in[3], -1.0f, 1.0f) * 0x7fff);
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t hi = (uint32_t)lum << 16;
         const uint32_t lo = (uint32_t)(alpha & 0xffff);
#else
         const uint32_t lo = (uint32_t)(lum & 0xffff);
         const uint32_t hi = (uint32_t)alpha << 16;
#endif
         *(uint32_t *)out = hi | lo;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch one L16A16_SNORM texel as RGBA float.
 *
 * The two signed 16-bit channels of the 32-bit texel are extracted by
 * sign-extending shifts.  Luminance is scaled by 1/0x7fff and written to
 * r, g and b; alpha is scaled the same way and written to a.  The texel
 * coordinates i and j are not used.
 *
 * The most negative code (-32768) would scale to slightly below -1.0, so
 * both channels are clamped to -1.0, matching the signed-normalized
 * conversion rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_l16a16_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const uint32_t value = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
   const int32_t rgb = ((int32_t)(value) ) >> 16;
   const int32_t a = ((int32_t)(value << 16) ) >> 16;
#else
   const int32_t rgb = ((int32_t)(value << 16) ) >> 16;
   const int32_t a = ((int32_t)(value) ) >> 16;
#endif
   const float lum_scaled = (float)(rgb * (1.0f/0x7fff));
   const float alpha_scaled = (float)(a * (1.0f/0x7fff));
   const float lum = lum_scaled < -1.0f ? -1.0f : lum_scaled;

   dst[0] = lum; /* r */
   dst[1] = lum; /* g */
   dst[2] = lum; /* b */
   dst[3] = alpha_scaled < -1.0f ? -1.0f : alpha_scaled; /* a */
}

/**
 * Unpack a run of L16A16_SNORM pixels into 8-bit UNORM RGBA.
 *
 * Both signed 16-bit channels are raised to 0 when negative and then
 * converted to unsigned 8-bit; luminance is replicated to r, g and b, and
 * alpha goes to a.
 */
void
util_format_l16a16_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned px = 0; px < width; px++, src += 4, out += 4) {
      const uint32_t texel = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
      const int32_t lum = ((int32_t)(texel) ) >> 16;
      const int32_t alpha = ((int32_t)(texel << 16) ) >> 16;
#else
      const int32_t lum = ((int32_t)(texel << 16) ) >> 16;
      const int32_t alpha = ((int32_t)(texel) ) >> 16;
#endif
      const uint8_t lum8 = _mesa_snorm_to_unorm(MAX2(lum, 0), 16, 8);

      out[0] = lum8; /* r */
      out[1] = lum8; /* g */
      out[2] = lum8; /* b */
      out[3] = _mesa_snorm_to_unorm(MAX2(alpha, 0), 16, 8); /* a */
   }
}

/**
 * Pack a rectangle of 8-bit UNORM RGBA pixels into L16A16_SNORM.
 *
 * The red and alpha bytes of each source pixel are converted from
 * unsigned 8-bit to signed normalized 16-bit and combined into one 32-bit
 * word; green and blue are ignored.  dst_stride and src_stride are in
 * bytes.
 */
void
util_format_l16a16_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t hi = (uint32_t)((uint32_t)(_mesa_unorm_to_snorm(in[0], 8, 16)) << 16);
         const uint32_t lo = (uint32_t)((_mesa_unorm_to_snorm(in[3], 8, 16)) & 0xffff);
#else
         const uint32_t lo = (uint32_t)((_mesa_unorm_to_snorm(in[0], 8, 16)) & 0xffff);
         const uint32_t hi = (uint32_t)((uint32_t)(_mesa_unorm_to_snorm(in[3], 8, 16)) << 16);
#endif
         *(uint32_t *)out = hi | lo;
      }

      dst_row += dst_stride;
      src_row += src_stride;
   }
}

/**
 * Unpack a run of I16_SNORM pixels into RGBA float.
 *
 * The signed 16-bit source value is scaled by 1/0x7fff and written to all
 * four components.  src is dereferenced as uint16_t.
 *
 * The most negative code (-32768) would scale to slightly below -1.0, so
 * the result is clamped to -1.0, matching the signed-normalized
 * conversion rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_i16_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const uint16_t value = *(const uint16_t *)src;
      const int16_t rgba = (int16_t)(value);
      const float scaled = (float)(rgba * (1.0f/0x7fff));
      const float intensity = scaled < -1.0f ? -1.0f : scaled;

      dst[0] = intensity; /* r */
      dst[1] = intensity; /* g */
      dst[2] = intensity; /* b */
      dst[3] = intensity; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA float pixels into I16_SNORM.
 *
 * Only the red component of each source pixel is used: it is clamped to
 * [-1, 1], scaled by 0x7fff and rounded.  dst_stride and src_stride are in
 * bytes.
 */
void
util_format_i16_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 2) {
         const int16_t intensity = (int16_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7fff);
         *(uint16_t *)out = (uint16_t)intensity;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch one I16_SNORM texel as RGBA float.
 *
 * The signed 16-bit value at src is scaled by 1/0x7fff and written to all
 * four components.  The texel coordinates i and j are not used.
 *
 * The most negative code (-32768) would scale to slightly below -1.0, so
 * the result is clamped to -1.0, matching the signed-normalized
 * conversion rule f = max(c / (2^(b-1) - 1), -1.0).
 */
void
util_format_i16_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const uint16_t value = *(const uint16_t *)src;
   const int16_t rgba = (int16_t)(value);
   const float scaled = (float)(rgba * (1.0f/0x7fff));
   const float intensity = scaled < -1.0f ? -1.0f : scaled;

   dst[0] = intensity; /* r */
   dst[1] = intensity; /* g */
   dst[2] = intensity; /* b */
   dst[3] = intensity; /* a */
}

/**
 * Unpack a run of I16_SNORM pixels into 8-bit UNORM RGBA.
 *
 * Negative values are raised to 0 before the signed 16-bit value is
 * converted to unsigned 8-bit; the result is replicated into all four
 * destination bytes.
 */
void
util_format_i16_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned px = 0; px < width; px++, src += 2, out += 4) {
      const int16_t texel = (int16_t)(*(const uint16_t *)src);
      const uint8_t intensity = _mesa_snorm_to_unorm(MAX2(texel, 0), 16, 8);

      out[0] = intensity; /* r */
      out[1] = intensity; /* g */
      out[2] = intensity; /* b */
      out[3] = intensity; /* a */
   }
}

/**
 * Pack a rectangle of 8-bit UNORM RGBA pixels into I16_SNORM.
 *
 * Only the red byte of each source pixel is used; it is converted from
 * unsigned 8-bit to signed normalized 16-bit.  dst_stride and src_stride
 * are in bytes.
 */
void
util_format_i16_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 2) {
         *(uint16_t *)out = (uint16_t)(_mesa_unorm_to_snorm(in[0], 8, 16));
      }

      dst_row += dst_stride;
      src_row += src_stride;
   }
}

/* Storage for one a16_float texel: a single 16-bit alpha value.  The
 * accessors in this file convert it with _mesa_half_to_float() and
 * _mesa_float_to_float16_rtz(). */
struct util_format_a16_float {
   uint16_t a;
};

void
util_format_a16_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = _mesa_half_to_float(pixel.a); /* a */
      src += 2;
      dst += 4;
   }
}

void
util_format_a16_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_a16_float pixel = {0};
         pixel.a = _mesa_float_to_float16_rtz(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single a16_float texel as RGBA floats.
 *
 * Writes 0 to r, g and b and the stored 16-bit value, converted with
 * _mesa_half_to_float(), to a.  `i` and `j` are unused.
 */
void
util_format_a16_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   struct util_format_a16_float texel;

   memcpy(&texel, src, sizeof texel);

   out[0] = 0; /* r */
   out[1] = 0; /* g */
   out[2] = 0; /* b */
   out[3] = _mesa_half_to_float(texel.a); /* a */
}

void
util_format_a16_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = float_to_ubyte(_mesa_half_to_float(pixel.a)); /* a */
      src += 2;
      dst += 4;
   }
}

void
util_format_a16_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_a16_float pixel = {0};
         pixel.a = _mesa_float_to_float16_rtz((float)(src[3] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* Storage for one l16_float texel: a single 16-bit value that the accessors
 * in this file replicate into r, g and b after converting it with
 * _mesa_half_to_float(). */
struct util_format_l16_float {
   uint16_t rgb;
};

void
util_format_l16_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_l16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_half_to_float(pixel.rgb); /* r */
         dst[1] = _mesa_half_to_float(pixel.rgb); /* g */
         dst[2] = _mesa_half_to_float(pixel.rgb); /* b */
         dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

void
util_format_l16_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_l16_float pixel = {0};
         pixel.rgb = _mesa_float_to_float16_rtz(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l16_float texel as RGBA floats.
 *
 * The stored 16-bit value is converted with _mesa_half_to_float() and
 * written to r, g and b; a is 1.  `i` and `j` are unused.
 */
void
util_format_l16_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   struct util_format_l16_float texel;

   memcpy(&texel, src, sizeof texel);

   const float lum = _mesa_half_to_float(texel.rgb);

   out[0] = lum; /* r */
   out[1] = lum; /* g */
   out[2] = lum; /* b */
   out[3] = 1; /* a */
}

void
util_format_l16_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_l16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(_mesa_half_to_float(pixel.rgb)); /* r */
         dst[1] = float_to_ubyte(_mesa_half_to_float(pixel.rgb)); /* g */
         dst[2] = float_to_ubyte(_mesa_half_to_float(pixel.rgb)); /* b */
         dst[3] = 255; /* a */
      src += 2;
      dst += 4;
   }
}

void
util_format_l16_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_l16_float pixel = {0};
         pixel.rgb = _mesa_float_to_float16_rtz((float)(src[0] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* Storage for one l16a16_float texel: two 16-bit values, `rgb` first and
 * `a` second.  The field order is the same for both byte orders, so no
 * endianness switch is needed here. */
struct util_format_l16a16_float {
   uint16_t rgb;
   uint16_t a;
};

void
util_format_l16a16_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_half_to_float(pixel.rgb); /* r */
         dst[1] = _mesa_half_to_float(pixel.rgb); /* g */
         dst[2] = _mesa_half_to_float(pixel.rgb); /* b */
         dst[3] = _mesa_half_to_float(pixel.a); /* a */
#else
         struct util_format_l16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_half_to_float(pixel.rgb); /* r */
         dst[1] = _mesa_half_to_float(pixel.rgb); /* g */
         dst[2] = _mesa_half_to_float(pixel.rgb); /* b */
         dst[3] = _mesa_half_to_float(pixel.a); /* a */
#endif
      src += 4;
      dst += 4;
   }
}

void
util_format_l16a16_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l16a16_float pixel = {0};
         pixel.rgb = _mesa_float_to_float16_rtz(src[0]);
         pixel.a = _mesa_float_to_float16_rtz(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l16a16_float pixel = {0};
         pixel.rgb = _mesa_float_to_float16_rtz(src[0]);
         pixel.a = _mesa_float_to_float16_rtz(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l16a16_float texel as RGBA floats.
 *
 * `rgb` is converted with _mesa_half_to_float() and written to r, g and b;
 * `a` is converted the same way and written to alpha.  `i` and `j` are
 * unused.  The code is the same for both byte orders.
 */
void
util_format_l16a16_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   struct util_format_l16a16_float texel;

   memcpy(&texel, src, sizeof texel);

   const float lum = _mesa_half_to_float(texel.rgb);

   out[0] = lum; /* r */
   out[1] = lum; /* g */
   out[2] = lum; /* b */
   out[3] = _mesa_half_to_float(texel.a); /* a */
}

void
util_format_l16a16_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(_mesa_half_to_float(pixel.rgb)); /* r */
         dst[1] = float_to_ubyte(_mesa_half_to_float(pixel.rgb)); /* g */
         dst[2] = float_to_ubyte(_mesa_half_to_float(pixel.rgb)); /* b */
         dst[3] = float_to_ubyte(_mesa_half_to_float(pixel.a)); /* a */
#else
         struct util_format_l16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(_mesa_half_to_float(pixel.rgb)); /* r */
         dst[1] = float_to_ubyte(_mesa_half_to_float(pixel.rgb)); /* g */
         dst[2] = float_to_ubyte(_mesa_half_to_float(pixel.rgb)); /* b */
         dst[3] = float_to_ubyte(_mesa_half_to_float(pixel.a)); /* a */
#endif
      src += 4;
      dst += 4;
   }
}

void
util_format_l16a16_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l16a16_float pixel = {0};
         pixel.rgb = _mesa_float_to_float16_rtz((float)(src[0] * (1.0f/0xff)));
         pixel.a = _mesa_float_to_float16_rtz((float)(src[3] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l16a16_float pixel = {0};
         pixel.rgb = _mesa_float_to_float16_rtz((float)(src[0] * (1.0f/0xff)));
         pixel.a = _mesa_float_to_float16_rtz((float)(src[3] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* Storage for one i16_float texel: a single 16-bit value that the accessors
 * in this file replicate into r, g, b and a after converting it with
 * _mesa_half_to_float(). */
struct util_format_i16_float {
   uint16_t rgba;
};

void
util_format_i16_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_i16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_half_to_float(pixel.rgba); /* r */
         dst[1] = _mesa_half_to_float(pixel.rgba); /* g */
         dst[2] = _mesa_half_to_float(pixel.rgba); /* b */
         dst[3] = _mesa_half_to_float(pixel.rgba); /* a */
      src += 2;
      dst += 4;
   }
}

void
util_format_i16_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_i16_float pixel = {0};
         pixel.rgba = _mesa_float_to_float16_rtz(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single i16_float texel as RGBA floats.
 *
 * The stored 16-bit value is converted with _mesa_half_to_float() and
 * written to all four channels.  `i` and `j` are unused.
 */
void
util_format_i16_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   struct util_format_i16_float texel;

   memcpy(&texel, src, sizeof texel);

   const float intensity = _mesa_half_to_float(texel.rgba);

   out[0] = intensity; /* r */
   out[1] = intensity; /* g */
   out[2] = intensity; /* b */
   out[3] = intensity; /* a */
}

void
util_format_i16_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_i16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(_mesa_half_to_float(pixel.rgba)); /* r */
         dst[1] = float_to_ubyte(_mesa_half_to_float(pixel.rgba)); /* g */
         dst[2] = float_to_ubyte(_mesa_half_to_float(pixel.rgba)); /* b */
         dst[3] = float_to_ubyte(_mesa_half_to_float(pixel.rgba)); /* a */
      src += 2;
      dst += 4;
   }
}

void
util_format_i16_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_i16_float pixel = {0};
         pixel.rgba = _mesa_float_to_float16_rtz((float)(src[0] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* Storage for one a32_float texel: a single float holding alpha, copied
 * to and from texel memory with memcpy() by the accessors in this file. */
struct util_format_a32_float {
   float a;
};

void
util_format_a32_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = pixel.a; /* a */
      src += 4;
      dst += 4;
   }
}

void
util_format_a32_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_a32_float pixel = {0};
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single a32_float texel as RGBA floats.
 *
 * Writes 0 to r, g and b and the stored float, unchanged, to a.  `i` and
 * `j` are unused.
 */
void
util_format_a32_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   struct util_format_a32_float texel;

   memcpy(&texel, src, sizeof texel);

   out[0] = 0; /* r */
   out[1] = 0; /* g */
   out[2] = 0; /* b */
   out[3] = texel.a; /* a */
}

void
util_format_a32_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = float_to_ubyte(pixel.a); /* a */
      src += 4;
      dst += 4;
   }
}

void
util_format_a32_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_a32_float pixel = {0};
         pixel.a = ubyte_to_float(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* Storage for one l32_float texel: a single float that the accessors in
 * this file replicate into r, g and b. */
struct util_format_l32_float {
   float rgb;
};

void
util_format_l32_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_l32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

void
util_format_l32_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_l32_float pixel = {0};
         pixel.rgb = src[0];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l32_float texel as RGBA floats.
 *
 * The stored float is written unchanged to r, g and b; a is 1.  `i` and `j`
 * are unused.
 */
void
util_format_l32_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   struct util_format_l32_float texel;

   memcpy(&texel, src, sizeof texel);

   out[0] = texel.rgb; /* r */
   out[1] = texel.rgb; /* g */
   out[2] = texel.rgb; /* b */
   out[3] = 1; /* a */
}

void
util_format_l32_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_l32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.rgb); /* r */
         dst[1] = float_to_ubyte(pixel.rgb); /* g */
         dst[2] = float_to_ubyte(pixel.rgb); /* b */
         dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

void
util_format_l32_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_l32_float pixel = {0};
         pixel.rgb = ubyte_to_float(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* Storage for one l32a32_float texel: two floats, `rgb` first and `a`
 * second.  The field order is the same for both byte orders, so no
 * endianness switch is needed here. */
struct util_format_l32a32_float {
   float rgb;
   float a;
};

void
util_format_l32a32_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = pixel.a; /* a */
#else
         struct util_format_l32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = pixel.a; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_l32a32_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_float pixel = {0};
         pixel.rgb = src[0];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l32a32_float pixel = {0};
         pixel.rgb = src[0];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l32a32_float texel as RGBA floats.
 *
 * `rgb` is written unchanged to r, g and b and `a` to alpha.  `i` and `j`
 * are unused.  The code is the same for both byte orders.
 */
void
util_format_l32a32_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   struct util_format_l32a32_float texel;

   memcpy(&texel, src, sizeof texel);

   out[0] = texel.rgb; /* r */
   out[1] = texel.rgb; /* g */
   out[2] = texel.rgb; /* b */
   out[3] = texel.a; /* a */
}

void
util_format_l32a32_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.rgb); /* r */
         dst[1] = float_to_ubyte(pixel.rgb); /* g */
         dst[2] = float_to_ubyte(pixel.rgb); /* b */
         dst[3] = float_to_ubyte(pixel.a); /* a */
#else
         struct util_format_l32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.rgb); /* r */
         dst[1] = float_to_ubyte(pixel.rgb); /* g */
         dst[2] = float_to_ubyte(pixel.rgb); /* b */
         dst[3] = float_to_ubyte(pixel.a); /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_l32a32_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_float pixel = {0};
         pixel.rgb = ubyte_to_float(src[0]);
         pixel.a = ubyte_to_float(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l32a32_float pixel = {0};
         pixel.rgb = ubyte_to_float(src[0]);
         pixel.a = ubyte_to_float(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* Storage for one i32_float texel: a single float that the accessors in
 * this file replicate into r, g, b and a. */
struct util_format_i32_float {
   float rgba;
};

void
util_format_i32_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_i32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgba; /* r */
         dst[1] = pixel.rgba; /* g */
         dst[2] = pixel.rgba; /* b */
         dst[3] = pixel.rgba; /* a */
      src += 4;
      dst += 4;
   }
}

void
util_format_i32_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_i32_float pixel = {0};
         pixel.rgba = src[0];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single i32_float texel as RGBA floats.
 *
 * The stored float is written unchanged to all four channels.  `i` and `j`
 * are unused.
 */
void
util_format_i32_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   struct util_format_i32_float texel;

   memcpy(&texel, src, sizeof texel);

   out[0] = texel.rgba; /* r */
   out[1] = texel.rgba; /* g */
   out[2] = texel.rgba; /* b */
   out[3] = texel.rgba; /* a */
}

void
util_format_i32_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_i32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.rgba); /* r */
         dst[1] = float_to_ubyte(pixel.rgba); /* g */
         dst[2] = float_to_ubyte(pixel.rgba); /* b */
         dst[3] = float_to_ubyte(pixel.rgba); /* a */
      src += 4;
      dst += 4;
   }
}

void
util_format_i32_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_i32_float pixel = {0};
         pixel.rgba = ubyte_to_float(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of l8_srgb texels to RGBA floats.
 *
 * For each of the `width` 1-byte texels read from `src`, the byte is
 * converted with util_format_srgb_8unorm_to_linear_float() and written to
 * r, g and b; a is 1.
 */
void
util_format_l8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   const uint8_t *in = src;

   for (unsigned px = 0; px < width; ++px, in += 1, out += 4) {
      const uint8_t rgb = *in;
      const float lum = util_format_srgb_8unorm_to_linear_float(rgb);

      out[0] = lum; /* r */
      out[1] = lum; /* g */
      out[2] = lum; /* b */
      out[3] = 1; /* a */
   }
}

/**
 * Pack a rectangle of RGBA float pixels into l8_srgb texels.
 *
 * Only the red component (index 0) of each source pixel is used; it is
 * converted with util_format_linear_float_to_srgb_8unorm() and stored as a
 * single byte.  `dst_stride` and `src_stride` are the byte distances
 * between rows.
 */
void
util_format_l8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 1) {
         *out = util_format_linear_float_to_srgb_8unorm(in[0]);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single l8_srgb texel as RGBA floats.
 *
 * The byte at `src` is converted with
 * util_format_srgb_8unorm_to_linear_float() and written to r, g and b; a is
 * 1.  `i` and `j` are unused.
 */
void
util_format_l8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   const uint8_t rgb = *src;
   const float lum = util_format_srgb_8unorm_to_linear_float(rgb);

   out[0] = lum; /* r */
   out[1] = lum; /* g */
   out[2] = lum; /* b */
   out[3] = 1; /* a */
}

/**
 * Unpack a row of l8_srgb texels to RGBA 8-bit unorm.
 *
 * For each of the `width` 1-byte texels read from `src`, the byte is
 * converted with util_format_srgb_to_linear_8unorm() and written to r, g
 * and b; a is 255.
 */
void
util_format_l8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   const uint8_t *in = src;

   for (unsigned px = 0; px < width; ++px, in += 1, out += 4) {
      const uint8_t rgb = *in;
      const uint8_t lum = util_format_srgb_to_linear_8unorm(rgb);

      out[0] = lum; /* r */
      out[1] = lum; /* g */
      out[2] = lum; /* b */
      out[3] = 255; /* a */
   }
}

/**
 * Pack a rectangle of RGBA 8-bit unorm pixels into l8_srgb texels.
 *
 * Only the red byte (index 0) of each source pixel is used; it is converted
 * with util_format_linear_to_srgb_8unorm() and stored as a single byte.
 * `dst_stride` and `src_stride` are the byte distances between rows.
 */
void
util_format_l8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 1) {
         *out = util_format_linear_to_srgb_8unorm(in[0]);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of r8_srgb texels to RGBA floats.
 *
 * For each of the `width` 1-byte texels read from `src`, the byte is
 * converted with util_format_srgb_8unorm_to_linear_float() and written to
 * r; g and b are 0 and a is 1.
 */
void
util_format_r8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   const uint8_t *in = src;

   for (unsigned px = 0; px < width; ++px, in += 1, out += 4) {
      const uint8_t r = *in;

      out[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
      out[1] = 0; /* g */
      out[2] = 0; /* b */
      out[3] = 1; /* a */
   }
}

/**
 * Pack a rectangle of RGBA float pixels into r8_srgb texels.
 *
 * Only the red component (index 0) of each source pixel is used; it is
 * converted with util_format_linear_float_to_srgb_8unorm() and stored as a
 * single byte.  `dst_stride` and `src_stride` are the byte distances
 * between rows.
 */
void
util_format_r8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 1) {
         *out = util_format_linear_float_to_srgb_8unorm(in[0]);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single r8_srgb texel as RGBA floats.
 *
 * The byte at `src` is converted with
 * util_format_srgb_8unorm_to_linear_float() and written to r; g and b are 0
 * and a is 1.  `i` and `j` are unused.
 */
void
util_format_r8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   const uint8_t r = *src;

   out[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
   out[1] = 0; /* g */
   out[2] = 0; /* b */
   out[3] = 1; /* a */
}

/**
 * Unpack a row of r8_srgb texels to RGBA 8-bit unorm.
 *
 * For each of the `width` 1-byte texels read from `src`, the byte is
 * converted with util_format_srgb_to_linear_8unorm() and written to r; g
 * and b are 0 and a is 255.
 */
void
util_format_r8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   const uint8_t *in = src;

   for (unsigned px = 0; px < width; ++px, in += 1, out += 4) {
      const uint8_t r = *in;

      out[0] = util_format_srgb_to_linear_8unorm(r); /* r */
      out[1] = 0; /* g */
      out[2] = 0; /* b */
      out[3] = 255; /* a */
   }
}

/**
 * Pack a rectangle of RGBA 8-bit unorm pixels into r8_srgb texels.
 *
 * Only the red byte (index 0) of each source pixel is used; it is converted
 * with util_format_linear_to_srgb_8unorm() and stored as a single byte.
 * `dst_stride` and `src_stride` are the byte distances between rows.
 */
void
util_format_r8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col, in += 4, out += 1) {
         *out = util_format_linear_to_srgb_8unorm(in[0]);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of l8a8_srgb texels to RGBA floats.
 *
 * Each of the `width` texels is a 16-bit value holding two 8-bit fields.
 * On big-endian builds `rgb` is the high byte and `a` the low byte; on
 * little-endian builds it is the other way round.  `rgb` is converted with
 * util_format_srgb_8unorm_to_linear_float() and written to r, g and b; `a`
 * is converted with ubyte_to_float().
 *
 * Texels are read with memcpy() instead of through a casted uint16_t
 * pointer, so `src` does not have to be 2-byte aligned.
 */
void
util_format_l8a8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;

   for (unsigned x = 0; x < width; x += 1) {
      uint16_t value;
      memcpy(&value, src, sizeof value);

#if UTIL_ARCH_BIG_ENDIAN
      const uint16_t rgb = value >> 8;
      const uint16_t a = (value) & 0xff;
#else
      const uint16_t rgb = (value) & 0xff;
      const uint16_t a = value >> 8;
#endif
      const float lum = util_format_srgb_8unorm_to_linear_float(rgb);

      dst[0] = lum; /* r */
      dst[1] = lum; /* g */
      dst[2] = lum; /* b */
      dst[3] = ubyte_to_float(a); /* a */

      src += 2;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA float pixels into l8a8_srgb texels.
 *
 * The red component (index 0) is converted with
 * util_format_linear_float_to_srgb_8unorm() and the alpha component
 * (index 3) with float_to_ubyte().  The two bytes are combined into one
 * 16-bit value: on big-endian builds the luminance byte is the high byte,
 * on little-endian builds it is the low byte.  `dst_stride` and
 * `src_stride` are the byte distances between rows.
 *
 * Texels are stored with memcpy() instead of through a casted uint16_t
 * pointer, so `dst_row` does not have to be 2-byte aligned.
 */
void
util_format_l8a8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;

      for (unsigned x = 0; x < width; x += 1) {
         const uint8_t lum = util_format_linear_float_to_srgb_8unorm(src[0]);
         const uint8_t alpha = float_to_ubyte(src[3]);
         uint16_t value = 0;

#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)(lum) << 8;
         value |= (alpha) & 0xff;
#else
         value |= (lum) & 0xff;
         value |= (uint32_t)(alpha) << 8;
#endif
         memcpy(dst, &value, sizeof value);

         src += 4;
         dst += 2;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single L8A8_SRGB pixel as RGBA floats.
 *
 * @param in_dst  destination; four floats are written
 * @param src     pointer to the two-byte packed pixel
 * @param i       unused
 * @param j       unused
 */
void
util_format_l8a8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   /* memcpy load: src is a byte pointer, so a direct uint16_t dereference
    * could be misaligned and breaks the effective-type rule.
    */
   uint16_t value;
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   uint16_t rgb = value >> 8;
   uint16_t a = value & 0xff;
#else
   uint16_t rgb = value & 0xff;
   uint16_t a = value >> 8;
#endif
   const float l = util_format_srgb_8unorm_to_linear_float(rgb);
   dst[0] = l; /* r */
   dst[1] = l; /* g */
   dst[2] = l; /* b */
   dst[3] = ubyte_to_float(a); /* a */
}

/**
 * Unpack a row of L8A8_SRGB pixels into 8-bit RGBA.
 *
 * The luminance byte goes through util_format_srgb_to_linear_8unorm() and is
 * replicated to r/g/b; the alpha byte is copied unchanged.
 *
 * @param dst_row  destination; four bytes are written per pixel
 * @param src      packed source pixels, two bytes each
 * @param width    number of pixels to convert
 */
void
util_format_l8a8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* memcpy load avoids a possibly misaligned, type-punned uint16_t read
       * through the byte pointer.
       */
      uint16_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      uint16_t rgb = value >> 8;
      uint16_t a = value & 0xff;
#else
      uint16_t rgb = value & 0xff;
      uint16_t a = value >> 8;
#endif
      const uint8_t l = util_format_srgb_to_linear_8unorm(rgb);
      dst[0] = l; /* r */
      dst[1] = l; /* g */
      dst[2] = l; /* b */
      dst[3] = a; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into L8A8_SRGB.
 *
 * src[0] (red, used as luminance) goes through
 * util_format_linear_to_srgb_8unorm(); src[3] (alpha) is stored unchanged.
 * Green and blue are ignored.
 *
 * @param dst_row     first destination row, two bytes per pixel
 * @param dst_stride  distance between destination rows, in bytes
 * @param src_row     first source row, four bytes per pixel
 * @param src_stride  distance between source rows, in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l8a8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)(util_format_linear_to_srgb_8unorm(src[0])) << 8;
         value |= (src[3]) & 0xff;
#else
         value |= (util_format_linear_to_srgb_8unorm(src[0])) & 0xff;
         value |= (uint32_t)(src[3]) << 8;
#endif
         /* Store with memcpy: dst has no 16-bit alignment guarantee. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R8G8_SRGB pixels into RGBA floats.
 *
 * Red and green go through util_format_srgb_8unorm_to_linear_float(); blue
 * is written as 0 and alpha as 1.
 *
 * @param dst_row  destination; four floats are written per pixel
 * @param src      packed source pixels, two bytes each
 * @param width    number of pixels to convert
 */
void
util_format_r8g8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* memcpy load: src is a byte pointer, so a direct uint16_t
       * dereference could be misaligned and breaks the effective-type rule.
       */
      uint16_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      uint16_t r = value >> 8;
      uint16_t g = value & 0xff;
#else
      uint16_t r = value & 0xff;
      uint16_t g = value >> 8;
#endif
      dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
      dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA floats into R8G8_SRGB.
 *
 * src[0] and src[1] are converted with
 * util_format_linear_float_to_srgb_8unorm(); blue and alpha are ignored.
 *
 * @param dst_row     first destination row, two bytes per pixel
 * @param dst_stride  distance between destination rows, in bytes
 * @param src_row     first source row, four floats per pixel
 * @param src_stride  distance between source rows, in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_r8g8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)(util_format_linear_float_to_srgb_8unorm(src[0])) << 8;
         value |= (util_format_linear_float_to_srgb_8unorm(src[1])) & 0xff;
#else
         value |= (util_format_linear_float_to_srgb_8unorm(src[0])) & 0xff;
         value |= (uint32_t)(util_format_linear_float_to_srgb_8unorm(src[1])) << 8;
#endif
         /* Store with memcpy: dst has no 16-bit alignment guarantee. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes, src_row advances in floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8_SRGB pixel as RGBA floats (blue = 0, alpha = 1).
 *
 * @param in_dst  destination; four floats are written
 * @param src     pointer to the two-byte packed pixel
 * @param i       unused
 * @param j       unused
 */
void
util_format_r8g8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   /* memcpy load avoids a possibly misaligned, type-punned uint16_t read
    * through the byte pointer.
    */
   uint16_t value;
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   uint16_t r = value >> 8;
   uint16_t g = value & 0xff;
#else
   uint16_t r = value & 0xff;
   uint16_t g = value >> 8;
#endif
   dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
   dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a row of R8G8_SRGB pixels into 8-bit RGBA.
 *
 * Red and green go through util_format_srgb_to_linear_8unorm(); blue is
 * written as 0 and alpha as 255.
 *
 * @param dst_row  destination; four bytes are written per pixel
 * @param src      packed source pixels, two bytes each
 * @param width    number of pixels to convert
 */
void
util_format_r8g8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* memcpy load: src carries no 16-bit alignment guarantee. */
      uint16_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      uint16_t r = value >> 8;
      uint16_t g = value & 0xff;
#else
      uint16_t r = value & 0xff;
      uint16_t g = value >> 8;
#endif
      dst[0] = util_format_srgb_to_linear_8unorm(r); /* r */
      dst[1] = util_format_srgb_to_linear_8unorm(g); /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into R8G8_SRGB.
 *
 * src[0] and src[1] are converted with util_format_linear_to_srgb_8unorm();
 * blue and alpha are ignored.
 *
 * @param dst_row     first destination row, two bytes per pixel
 * @param dst_stride  distance between destination rows, in bytes
 * @param src_row     first source row, four bytes per pixel
 * @param src_stride  distance between source rows, in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_r8g8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)(util_format_linear_to_srgb_8unorm(src[0])) << 8;
         value |= (util_format_linear_to_srgb_8unorm(src[1])) & 0xff;
#else
         value |= (util_format_linear_to_srgb_8unorm(src[0])) & 0xff;
         value |= (uint32_t)(util_format_linear_to_srgb_8unorm(src[1])) << 8;
#endif
         /* Store with memcpy: dst has no 16-bit alignment guarantee. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R8G8B8_SRGB pixel: three consecutive bytes.
 *
 * The member order is the same on big- and little-endian hosts, so no
 * endianness conditional is needed.
 */
struct util_format_r8g8b8_srgb {
   uint8_t r; /* first byte */
   uint8_t g; /* second byte */
   uint8_t b; /* third byte */
};

void
util_format_r8g8b8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_8unorm_to_linear_float(pixel.r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(pixel.g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_8unorm_to_linear_float(pixel.r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(pixel.g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_r8g8b8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_srgb pixel = {0};
         pixel.r = util_format_linear_float_to_srgb_8unorm(src[0]);
         pixel.g = util_format_linear_float_to_srgb_8unorm(src[1]);
         pixel.b = util_format_linear_float_to_srgb_8unorm(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_srgb pixel = {0};
         pixel.r = util_format_linear_float_to_srgb_8unorm(src[0]);
         pixel.g = util_format_linear_float_to_srgb_8unorm(src[1]);
         pixel.b = util_format_linear_float_to_srgb_8unorm(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8_SRGB pixel as RGBA floats (alpha = 1).
 *
 * @param in_dst  destination; four floats are written
 * @param src     pointer to the three-byte packed pixel
 * @param i       unused
 * @param j       unused
 */
void
util_format_r8g8b8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   struct util_format_r8g8b8_srgb texel;
   memcpy(&texel, src, sizeof texel);
   out[0] = util_format_srgb_8unorm_to_linear_float(texel.r); /* r */
   out[1] = util_format_srgb_8unorm_to_linear_float(texel.g); /* g */
   out[2] = util_format_srgb_8unorm_to_linear_float(texel.b); /* b */
   out[3] = 1; /* a */
}

void
util_format_r8g8b8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_to_linear_8unorm(pixel.r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(pixel.g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(pixel.b); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r8g8b8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_to_linear_8unorm(pixel.r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(pixel.g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(pixel.b); /* b */
         dst[3] = 255; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_r8g8b8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_srgb pixel = {0};
         pixel.r = util_format_linear_to_srgb_8unorm(src[0]);
         pixel.g = util_format_linear_to_srgb_8unorm(src[1]);
         pixel.b = util_format_linear_to_srgb_8unorm(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_srgb pixel = {0};
         pixel.r = util_format_linear_to_srgb_8unorm(src[0]);
         pixel.g = util_format_linear_to_srgb_8unorm(src[1]);
         pixel.b = util_format_linear_to_srgb_8unorm(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one B8G8R8_SRGB pixel: three consecutive bytes.
 *
 * The member order is the same on big- and little-endian hosts, so no
 * endianness conditional is needed.
 */
struct util_format_b8g8r8_srgb {
   uint8_t b; /* first byte */
   uint8_t g; /* second byte */
   uint8_t r; /* third byte */
};

void
util_format_b8g8r8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_8unorm_to_linear_float(pixel.r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(pixel.g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_8unorm_to_linear_float(pixel.r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(pixel.g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_b8g8r8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_srgb pixel = {0};
         pixel.b = util_format_linear_float_to_srgb_8unorm(src[2]);
         pixel.g = util_format_linear_float_to_srgb_8unorm(src[1]);
         pixel.r = util_format_linear_float_to_srgb_8unorm(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_srgb pixel = {0};
         pixel.b = util_format_linear_float_to_srgb_8unorm(src[2]);
         pixel.g = util_format_linear_float_to_srgb_8unorm(src[1]);
         pixel.r = util_format_linear_float_to_srgb_8unorm(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8_SRGB pixel as RGBA floats (alpha = 1).
 *
 * @param in_dst  destination; four floats are written
 * @param src     pointer to the three-byte packed pixel
 * @param i       unused
 * @param j       unused
 */
void
util_format_b8g8r8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   struct util_format_b8g8r8_srgb texel;
   memcpy(&texel, src, sizeof texel);
   out[0] = util_format_srgb_8unorm_to_linear_float(texel.r); /* r */
   out[1] = util_format_srgb_8unorm_to_linear_float(texel.g); /* g */
   out[2] = util_format_srgb_8unorm_to_linear_float(texel.b); /* b */
   out[3] = 1; /* a */
}

void
util_format_b8g8r8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_to_linear_8unorm(pixel.r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(pixel.g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(pixel.b); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_b8g8r8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_to_linear_8unorm(pixel.r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(pixel.g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(pixel.b); /* b */
         dst[3] = 255; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_b8g8r8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_srgb pixel = {0};
         pixel.b = util_format_linear_to_srgb_8unorm(src[2]);
         pixel.g = util_format_linear_to_srgb_8unorm(src[1]);
         pixel.r = util_format_linear_to_srgb_8unorm(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_srgb pixel = {0};
         pixel.b = util_format_linear_to_srgb_8unorm(src[2]);
         pixel.g = util_format_linear_to_srgb_8unorm(src[1]);
         pixel.r = util_format_linear_to_srgb_8unorm(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R8G8B8A8_SRGB pixels into RGBA floats.
 *
 * Each pixel is read as one native-endian 32-bit word.  The colour bytes go
 * through util_format_srgb_8unorm_to_linear_float(); the alpha byte goes
 * through ubyte_to_float().
 *
 * @param dst_row  destination; four floats are written per pixel
 * @param src      packed source pixels, four bytes each
 * @param width    number of pixels to convert
 */
void
util_format_r8g8b8a8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Load with memcpy instead of dereferencing a cast pointer: src is a
       * byte pointer, so a uint32_t access may be misaligned and violates
       * the effective-type rule.
       */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      uint32_t r = value >> 24;
      uint32_t g = (value >> 16) & 0xff;
      uint32_t b = (value >> 8) & 0xff;
      uint32_t a = (value) & 0xff;
#else
      uint32_t r = (value) & 0xff;
      uint32_t g = (value >> 8) & 0xff;
      uint32_t b = (value >> 16) & 0xff;
      uint32_t a = value >> 24;
#endif
      dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
      dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
      dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
      dst[3] = ubyte_to_float(a); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA floats into R8G8B8A8_SRGB.
 *
 * src[0..2] are converted with util_format_linear_float_to_srgb_8unorm();
 * src[3] is converted with float_to_ubyte().
 *
 * @param dst_row     first destination row, four bytes per pixel
 * @param dst_stride  distance between destination rows, in bytes
 * @param src_row     first source row, four floats per pixel
 * @param src_stride  distance between source rows, in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_r8g8b8a8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)(util_format_linear_float_to_srgb_8unorm(src[0])) << 24;
         value |= (uint32_t)((util_format_linear_float_to_srgb_8unorm(src[1])) & 0xff) << 16;
         value |= (uint32_t)((util_format_linear_float_to_srgb_8unorm(src[2])) & 0xff) << 8;
         value |= (float_to_ubyte(src[3])) & 0xff;
#else
         value |= (util_format_linear_float_to_srgb_8unorm(src[0])) & 0xff;
         value |= (uint32_t)((util_format_linear_float_to_srgb_8unorm(src[1])) & 0xff) << 8;
         value |= (uint32_t)((util_format_linear_float_to_srgb_8unorm(src[2])) & 0xff) << 16;
         value |= (uint32_t)(float_to_ubyte(src[3])) << 24;
#endif
         /* Store with memcpy: dst is a byte pointer and carries no 32-bit
          * alignment guarantee.
          */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes, src_row advances in floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8A8_SRGB pixel as RGBA floats.
 *
 * @param in_dst  destination; four floats are written
 * @param src     pointer to the four-byte packed pixel
 * @param i       unused
 * @param j       unused
 */
void
util_format_r8g8b8a8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   /* memcpy load: src is a byte pointer, so a direct uint32_t dereference
    * could be misaligned and breaks the effective-type rule.
    */
   uint32_t value;
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   uint32_t r = value >> 24;
   uint32_t g = (value >> 16) & 0xff;
   uint32_t b = (value >> 8) & 0xff;
   uint32_t a = (value) & 0xff;
#else
   uint32_t r = (value) & 0xff;
   uint32_t g = (value >> 8) & 0xff;
   uint32_t b = (value >> 16) & 0xff;
   uint32_t a = value >> 24;
#endif
   dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
   dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
   dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
   dst[3] = ubyte_to_float(a); /* a */
}

/**
 * Unpack a row of R8G8B8A8_SRGB pixels into 8-bit RGBA.
 *
 * The colour bytes go through util_format_srgb_to_linear_8unorm(); the alpha
 * byte is copied unchanged.
 *
 * @param dst_row  destination; four bytes are written per pixel
 * @param src      packed source pixels, four bytes each
 * @param width    number of pixels to convert
 */
void
util_format_r8g8b8a8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* memcpy load avoids a possibly misaligned, type-punned uint32_t read
       * through the byte pointer.
       */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      uint32_t r = value >> 24;
      uint32_t g = (value >> 16) & 0xff;
      uint32_t b = (value >> 8) & 0xff;
      uint32_t a = (value) & 0xff;
#else
      uint32_t r = (value) & 0xff;
      uint32_t g = (value >> 8) & 0xff;
      uint32_t b = (value >> 16) & 0xff;
      uint32_t a = value >> 24;
#endif
      dst[0] = util_format_srgb_to_linear_8unorm(r); /* r */
      dst[1] = util_format_srgb_to_linear_8unorm(g); /* g */
      dst[2] = util_format_srgb_to_linear_8unorm(b); /* b */
      dst[3] = a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into R8G8B8A8_SRGB.
 *
 * src[0..2] are converted with util_format_linear_to_srgb_8unorm(); src[3]
 * (alpha) is stored unchanged.
 *
 * @param dst_row     first destination row, four bytes per pixel
 * @param dst_stride  distance between destination rows, in bytes
 * @param src_row     first source row, four bytes per pixel
 * @param src_stride  distance between source rows, in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_r8g8b8a8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)(util_format_linear_to_srgb_8unorm(src[0])) << 24;
         value |= (uint32_t)((util_format_linear_to_srgb_8unorm(src[1])) & 0xff) << 16;
         value |= (uint32_t)((util_format_linear_to_srgb_8unorm(src[2])) & 0xff) << 8;
         value |= (src[3]) & 0xff;
#else
         value |= (util_format_linear_to_srgb_8unorm(src[0])) & 0xff;
         value |= (uint32_t)((util_format_linear_to_srgb_8unorm(src[1])) & 0xff) << 8;
         value |= (uint32_t)((util_format_linear_to_srgb_8unorm(src[2])) & 0xff) << 16;
         value |= (uint32_t)(src[3]) << 24;
#endif
         /* Store with memcpy: dst has no 32-bit alignment guarantee. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of A8B8G8R8_SRGB pixels into RGBA floats.
 *
 * Each pixel is read as one native-endian 32-bit word.  The colour bytes go
 * through util_format_srgb_8unorm_to_linear_float(); the alpha byte goes
 * through ubyte_to_float().
 *
 * @param dst_row  destination; four floats are written per pixel
 * @param src      packed source pixels, four bytes each
 * @param width    number of pixels to convert
 */
void
util_format_a8b8g8r8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Load with memcpy instead of dereferencing a cast pointer: src is a
       * byte pointer, so a uint32_t access may be misaligned and violates
       * the effective-type rule.
       */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      uint32_t a = value >> 24;
      uint32_t b = (value >> 16) & 0xff;
      uint32_t g = (value >> 8) & 0xff;
      uint32_t r = (value) & 0xff;
#else
      uint32_t a = (value) & 0xff;
      uint32_t b = (value >> 8) & 0xff;
      uint32_t g = (value >> 16) & 0xff;
      uint32_t r = value >> 24;
#endif
      dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
      dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
      dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
      dst[3] = ubyte_to_float(a); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA floats into A8B8G8R8_SRGB.
 *
 * src[0..2] are converted with util_format_linear_float_to_srgb_8unorm();
 * src[3] is converted with float_to_ubyte().
 *
 * @param dst_row     first destination row, four bytes per pixel
 * @param dst_stride  distance between destination rows, in bytes
 * @param src_row     first source row, four floats per pixel
 * @param src_stride  distance between source rows, in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_a8b8g8r8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)(float_to_ubyte(src[3])) << 24;
         value |= (uint32_t)((util_format_linear_float_to_srgb_8unorm(src[2])) & 0xff) << 16;
         value |= (uint32_t)((util_format_linear_float_to_srgb_8unorm(src[1])) & 0xff) << 8;
         value |= (util_format_linear_float_to_srgb_8unorm(src[0])) & 0xff;
#else
         value |= (float_to_ubyte(src[3])) & 0xff;
         value |= (uint32_t)((util_format_linear_float_to_srgb_8unorm(src[2])) & 0xff) << 8;
         value |= (uint32_t)((util_format_linear_float_to_srgb_8unorm(src[1])) & 0xff) << 16;
         value |= (uint32_t)(util_format_linear_float_to_srgb_8unorm(src[0])) << 24;
#endif
         /* Store with memcpy: dst is a byte pointer and carries no 32-bit
          * alignment guarantee.
          */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes, src_row advances in floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A8B8G8R8_SRGB pixel as RGBA floats.
 *
 * @param in_dst  destination; four floats are written
 * @param src     pointer to the four-byte packed pixel
 * @param i       unused
 * @param j       unused
 */
void
util_format_a8b8g8r8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   /* memcpy load: src is a byte pointer, so a direct uint32_t dereference
    * could be misaligned and breaks the effective-type rule.
    */
   uint32_t value;
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   uint32_t a = value >> 24;
   uint32_t b = (value >> 16) & 0xff;
   uint32_t g = (value >> 8) & 0xff;
   uint32_t r = (value) & 0xff;
#else
   uint32_t a = (value) & 0xff;
   uint32_t b = (value >> 8) & 0xff;
   uint32_t g = (value >> 16) & 0xff;
   uint32_t r = value >> 24;
#endif
   dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
   dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
   dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
   dst[3] = ubyte_to_float(a); /* a */
}

/**
 * Unpack a row of A8B8G8R8_SRGB pixels into 8-bit RGBA.
 *
 * The colour bytes go through util_format_srgb_to_linear_8unorm(); the alpha
 * byte is copied unchanged.
 *
 * @param dst_row  destination; four bytes are written per pixel
 * @param src      packed source pixels, four bytes each
 * @param width    number of pixels to convert
 */
void
util_format_a8b8g8r8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* memcpy load avoids a possibly misaligned, type-punned uint32_t read
       * through the byte pointer.
       */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      uint32_t a = value >> 24;
      uint32_t b = (value >> 16) & 0xff;
      uint32_t g = (value >> 8) & 0xff;
      uint32_t r = (value) & 0xff;
#else
      uint32_t a = (value) & 0xff;
      uint32_t b = (value >> 8) & 0xff;
      uint32_t g = (value >> 16) & 0xff;
      uint32_t r = value >> 24;
#endif
      dst[0] = util_format_srgb_to_linear_8unorm(r); /* r */
      dst[1] = util_format_srgb_to_linear_8unorm(g); /* g */
      dst[2] = util_format_srgb_to_linear_8unorm(b); /* b */
      dst[3] = a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into A8B8G8R8_SRGB.
 *
 * src[0..2] are converted with util_format_linear_to_srgb_8unorm(); src[3]
 * (alpha) is stored unchanged.
 *
 * @param dst_row     first destination row, four bytes per pixel
 * @param dst_stride  distance between destination rows, in bytes
 * @param src_row     first source row, four bytes per pixel
 * @param src_stride  distance between source rows, in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_a8b8g8r8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)(src[3]) << 24;
         value |= (uint32_t)((util_format_linear_to_srgb_8unorm(src[2])) & 0xff) << 16;
         value |= (uint32_t)((util_format_linear_to_srgb_8unorm(src[1])) & 0xff) << 8;
         value |= (util_format_linear_to_srgb_8unorm(src[0])) & 0xff;
#else
         value |= (src[3]) & 0xff;
         value |= (uint32_t)((util_format_linear_to_srgb_8unorm(src[2])) & 0xff) << 8;
         value |= (uint32_t)((util_format_linear_to_srgb_8unorm(src[1])) & 0xff) << 16;
         value |= (uint32_t)(util_format_linear_to_srgb_8unorm(src[0])) << 24;
#endif
         /* Store with memcpy: dst has no 32-bit alignment guarantee. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of X8B8G8R8_SRGB pixels into RGBA floats.
 *
 * Each pixel is read as one native-endian 32-bit word.  The colour bytes go
 * through util_format_srgb_8unorm_to_linear_float(); the X byte is not read
 * and alpha is written as 1.
 *
 * @param dst_row  destination; four floats are written per pixel
 * @param src      packed source pixels, four bytes each
 * @param width    number of pixels to convert
 */
void
util_format_x8b8g8r8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Load with memcpy instead of dereferencing a cast pointer: src is a
       * byte pointer, so a uint32_t access may be misaligned and violates
       * the effective-type rule.
       */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      uint32_t b = (value >> 16) & 0xff;
      uint32_t g = (value >> 8) & 0xff;
      uint32_t r = (value) & 0xff;
#else
      uint32_t b = (value >> 8) & 0xff;
      uint32_t g = (value >> 16) & 0xff;
      uint32_t r = value >> 24;
#endif
      dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
      dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
      dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA floats into X8B8G8R8_SRGB.
 *
 * src[0..2] are converted with util_format_linear_float_to_srgb_8unorm();
 * alpha is ignored and the X byte is written as zero.
 *
 * @param dst_row     first destination row, four bytes per pixel
 * @param dst_stride  distance between destination rows, in bytes
 * @param src_row     first source row, four floats per pixel
 * @param src_stride  distance between source rows, in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_x8b8g8r8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((util_format_linear_float_to_srgb_8unorm(src[2])) & 0xff) << 16;
         value |= (uint32_t)((util_format_linear_float_to_srgb_8unorm(src[1])) & 0xff) << 8;
         value |= (util_format_linear_float_to_srgb_8unorm(src[0])) & 0xff;
#else
         value |= (uint32_t)((util_format_linear_float_to_srgb_8unorm(src[2])) & 0xff) << 8;
         value |= (uint32_t)((util_format_linear_float_to_srgb_8unorm(src[1])) & 0xff) << 16;
         value |= (uint32_t)(util_format_linear_float_to_srgb_8unorm(src[0])) << 24;
#endif
         /* Store with memcpy: dst is a byte pointer and carries no 32-bit
          * alignment guarantee.
          */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes, src_row advances in floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one x8b8g8r8_srgb texel as four floats (r, g, b, a).
 *
 * The colour channels go through util_format_srgb_8unorm_to_linear_float();
 * alpha is always written as 1.  The i and j coordinates are unused.
 */
void
util_format_x8b8g8r8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 16, green_lsb = 8, red_lsb = 0 };
#else
   enum { blue_lsb = 8, green_lsb = 16, red_lsb = 24 };
#endif
   const uint32_t texel = *(const uint32_t *)src;
   const uint32_t blue = (texel >> blue_lsb) & 0xff;
   const uint32_t green = (texel >> green_lsb) & 0xff;
   const uint32_t red = (texel >> red_lsb) & 0xff;
   float *rgba = in_dst;

   rgba[0] = util_format_srgb_8unorm_to_linear_float(red);
   rgba[1] = util_format_srgb_8unorm_to_linear_float(green);
   rgba[2] = util_format_srgb_8unorm_to_linear_float(blue);
   rgba[3] = 1;
}

/**
 * Unpack one row of x8b8g8r8_srgb pixels into 8-bit RGBA.
 *
 * Colour channels go through util_format_srgb_to_linear_8unorm(); alpha is
 * always written as 255.
 */
void
util_format_x8b8g8r8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 16, green_lsb = 8, red_lsb = 0 };
#else
   enum { blue_lsb = 8, green_lsb = 16, red_lsb = 24 };
#endif
   const uint32_t *words = (const uint32_t *)src;
   uint8_t *rgba = dst_row;

   for (unsigned n = 0; n < width; n++, rgba += 4) {
      const uint32_t px = words[n];
      const uint32_t blue = (px >> blue_lsb) & 0xff;
      const uint32_t green = (px >> green_lsb) & 0xff;
      const uint32_t red = (px >> red_lsb) & 0xff;

      rgba[0] = util_format_srgb_to_linear_8unorm(red);
      rgba[1] = util_format_srgb_to_linear_8unorm(green);
      rgba[2] = util_format_srgb_to_linear_8unorm(blue);
      rgba[3] = 255;
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into x8b8g8r8_srgb.
 *
 * The first three bytes of every source pixel are passed through
 * util_format_linear_to_srgb_8unorm(); source alpha is not read and the X
 * byte of each output word stays zero.  Both strides are byte strides.
 */
void
util_format_x8b8g8r8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 16, green_lsb = 8, red_lsb = 0 };
#else
   enum { blue_lsb = 8, green_lsb = 16, red_lsb = 24 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++, rgba += 4) {
         const uint32_t blue = util_format_linear_to_srgb_8unorm(rgba[2]) & 0xff;
         const uint32_t green = util_format_linear_to_srgb_8unorm(rgba[1]) & 0xff;
         const uint32_t red = util_format_linear_to_srgb_8unorm(rgba[0]) & 0xff;

         out[col] = (blue << blue_lsb) | (green << green_lsb) | (red << red_lsb);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack one row of b8g8r8a8_srgb pixels into RGBA floats.
 *
 * Colour channels go through util_format_srgb_8unorm_to_linear_float();
 * alpha is converted with ubyte_to_float().
 */
void
util_format_b8g8r8a8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit offset of each channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 24, green_lsb = 16, red_lsb = 8, alpha_lsb = 0 };
#else
   enum { blue_lsb = 0, green_lsb = 8, red_lsb = 16, alpha_lsb = 24 };
#endif
   const uint32_t *words = (const uint32_t *)src;
   float *rgba = dst_row;

   for (unsigned n = 0; n < width; n++, rgba += 4) {
      const uint32_t px = words[n];
      const uint32_t blue = (px >> blue_lsb) & 0xff;
      const uint32_t green = (px >> green_lsb) & 0xff;
      const uint32_t red = (px >> red_lsb) & 0xff;
      const uint32_t alpha = (px >> alpha_lsb) & 0xff;

      rgba[0] = util_format_srgb_8unorm_to_linear_float(red);
      rgba[1] = util_format_srgb_8unorm_to_linear_float(green);
      rgba[2] = util_format_srgb_8unorm_to_linear_float(blue);
      rgba[3] = ubyte_to_float(alpha);
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into b8g8r8a8_srgb.
 *
 * Colour channels are passed through
 * util_format_linear_float_to_srgb_8unorm(); alpha is converted with
 * float_to_ubyte().  Both strides are byte strides.
 */
void
util_format_b8g8r8a8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit offset of each channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 24, green_lsb = 16, red_lsb = 8, alpha_lsb = 0 };
#else
   enum { blue_lsb = 0, green_lsb = 8, red_lsb = 16, alpha_lsb = 24 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++, rgba += 4) {
         const uint32_t blue = util_format_linear_float_to_srgb_8unorm(rgba[2]) & 0xff;
         const uint32_t green = util_format_linear_float_to_srgb_8unorm(rgba[1]) & 0xff;
         const uint32_t red = util_format_linear_float_to_srgb_8unorm(rgba[0]) & 0xff;
         const uint32_t alpha = float_to_ubyte(rgba[3]) & 0xff;

         out[col] = (blue << blue_lsb) | (green << green_lsb) |
                    (red << red_lsb) | (alpha << alpha_lsb);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one b8g8r8a8_srgb texel as four floats (r, g, b, a).
 *
 * Colour channels go through util_format_srgb_8unorm_to_linear_float();
 * alpha is converted with ubyte_to_float().  The i and j coordinates are
 * unused.
 */
void
util_format_b8g8r8a8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit offset of each channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 24, green_lsb = 16, red_lsb = 8, alpha_lsb = 0 };
#else
   enum { blue_lsb = 0, green_lsb = 8, red_lsb = 16, alpha_lsb = 24 };
#endif
   const uint32_t texel = *(const uint32_t *)src;
   const uint32_t blue = (texel >> blue_lsb) & 0xff;
   const uint32_t green = (texel >> green_lsb) & 0xff;
   const uint32_t red = (texel >> red_lsb) & 0xff;
   const uint32_t alpha = (texel >> alpha_lsb) & 0xff;
   float *rgba = in_dst;

   rgba[0] = util_format_srgb_8unorm_to_linear_float(red);
   rgba[1] = util_format_srgb_8unorm_to_linear_float(green);
   rgba[2] = util_format_srgb_8unorm_to_linear_float(blue);
   rgba[3] = ubyte_to_float(alpha);
}

/**
 * Unpack one row of b8g8r8a8_srgb pixels into 8-bit RGBA.
 *
 * Colour channels go through util_format_srgb_to_linear_8unorm(); the alpha
 * byte is copied unchanged.
 */
void
util_format_b8g8r8a8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit offset of each channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 24, green_lsb = 16, red_lsb = 8, alpha_lsb = 0 };
#else
   enum { blue_lsb = 0, green_lsb = 8, red_lsb = 16, alpha_lsb = 24 };
#endif
   const uint32_t *words = (const uint32_t *)src;
   uint8_t *rgba = dst_row;

   for (unsigned n = 0; n < width; n++, rgba += 4) {
      const uint32_t px = words[n];
      const uint32_t blue = (px >> blue_lsb) & 0xff;
      const uint32_t green = (px >> green_lsb) & 0xff;
      const uint32_t red = (px >> red_lsb) & 0xff;
      const uint32_t alpha = (px >> alpha_lsb) & 0xff;

      rgba[0] = util_format_srgb_to_linear_8unorm(red);
      rgba[1] = util_format_srgb_to_linear_8unorm(green);
      rgba[2] = util_format_srgb_to_linear_8unorm(blue);
      rgba[3] = alpha;
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into b8g8r8a8_srgb.
 *
 * Colour channels are passed through util_format_linear_to_srgb_8unorm();
 * the alpha byte is stored unchanged.  Both strides are byte strides.
 */
void
util_format_b8g8r8a8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit offset of each channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 24, green_lsb = 16, red_lsb = 8, alpha_lsb = 0 };
#else
   enum { blue_lsb = 0, green_lsb = 8, red_lsb = 16, alpha_lsb = 24 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++, rgba += 4) {
         const uint32_t blue = util_format_linear_to_srgb_8unorm(rgba[2]) & 0xff;
         const uint32_t green = util_format_linear_to_srgb_8unorm(rgba[1]) & 0xff;
         const uint32_t red = util_format_linear_to_srgb_8unorm(rgba[0]) & 0xff;
         const uint32_t alpha = rgba[3];

         out[col] = (blue << blue_lsb) | (green << green_lsb) |
                    (red << red_lsb) | (alpha << alpha_lsb);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack one row of b8g8r8x8_srgb pixels into RGBA floats.
 *
 * Colour channels go through util_format_srgb_8unorm_to_linear_float(); the
 * X byte is not read and alpha is always written as 1.
 */
void
util_format_b8g8r8x8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 24, green_lsb = 16, red_lsb = 8 };
#else
   enum { blue_lsb = 0, green_lsb = 8, red_lsb = 16 };
#endif
   const uint32_t *words = (const uint32_t *)src;
   float *rgba = dst_row;

   for (unsigned n = 0; n < width; n++, rgba += 4) {
      const uint32_t px = words[n];
      const uint32_t blue = (px >> blue_lsb) & 0xff;
      const uint32_t green = (px >> green_lsb) & 0xff;
      const uint32_t red = (px >> red_lsb) & 0xff;

      rgba[0] = util_format_srgb_8unorm_to_linear_float(red);
      rgba[1] = util_format_srgb_8unorm_to_linear_float(green);
      rgba[2] = util_format_srgb_8unorm_to_linear_float(blue);
      rgba[3] = 1;
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into b8g8r8x8_srgb.
 *
 * The first three floats of every source pixel are passed through
 * util_format_linear_float_to_srgb_8unorm(); source alpha is not read and
 * the X byte of each output word stays zero.  Both strides are byte strides.
 */
void
util_format_b8g8r8x8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 24, green_lsb = 16, red_lsb = 8 };
#else
   enum { blue_lsb = 0, green_lsb = 8, red_lsb = 16 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++, rgba += 4) {
         const uint32_t blue = util_format_linear_float_to_srgb_8unorm(rgba[2]) & 0xff;
         const uint32_t green = util_format_linear_float_to_srgb_8unorm(rgba[1]) & 0xff;
         const uint32_t red = util_format_linear_float_to_srgb_8unorm(rgba[0]) & 0xff;

         out[col] = (blue << blue_lsb) | (green << green_lsb) | (red << red_lsb);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one b8g8r8x8_srgb texel as four floats (r, g, b, a).
 *
 * Colour channels go through util_format_srgb_8unorm_to_linear_float();
 * alpha is always written as 1.  The i and j coordinates are unused.
 */
void
util_format_b8g8r8x8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 24, green_lsb = 16, red_lsb = 8 };
#else
   enum { blue_lsb = 0, green_lsb = 8, red_lsb = 16 };
#endif
   const uint32_t texel = *(const uint32_t *)src;
   const uint32_t blue = (texel >> blue_lsb) & 0xff;
   const uint32_t green = (texel >> green_lsb) & 0xff;
   const uint32_t red = (texel >> red_lsb) & 0xff;
   float *rgba = in_dst;

   rgba[0] = util_format_srgb_8unorm_to_linear_float(red);
   rgba[1] = util_format_srgb_8unorm_to_linear_float(green);
   rgba[2] = util_format_srgb_8unorm_to_linear_float(blue);
   rgba[3] = 1;
}

/**
 * Unpack one row of b8g8r8x8_srgb pixels into 8-bit RGBA.
 *
 * Colour channels go through util_format_srgb_to_linear_8unorm(); the X byte
 * is not read and alpha is always written as 255.
 */
void
util_format_b8g8r8x8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 24, green_lsb = 16, red_lsb = 8 };
#else
   enum { blue_lsb = 0, green_lsb = 8, red_lsb = 16 };
#endif
   const uint32_t *words = (const uint32_t *)src;
   uint8_t *rgba = dst_row;

   for (unsigned n = 0; n < width; n++, rgba += 4) {
      const uint32_t px = words[n];
      const uint32_t blue = (px >> blue_lsb) & 0xff;
      const uint32_t green = (px >> green_lsb) & 0xff;
      const uint32_t red = (px >> red_lsb) & 0xff;

      rgba[0] = util_format_srgb_to_linear_8unorm(red);
      rgba[1] = util_format_srgb_to_linear_8unorm(green);
      rgba[2] = util_format_srgb_to_linear_8unorm(blue);
      rgba[3] = 255;
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into b8g8r8x8_srgb.
 *
 * The first three bytes of every source pixel are passed through
 * util_format_linear_to_srgb_8unorm(); source alpha is not read and the X
 * byte of each output word stays zero.  Both strides are byte strides.
 */
void
util_format_b8g8r8x8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { blue_lsb = 24, green_lsb = 16, red_lsb = 8 };
#else
   enum { blue_lsb = 0, green_lsb = 8, red_lsb = 16 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++, rgba += 4) {
         const uint32_t blue = util_format_linear_to_srgb_8unorm(rgba[2]) & 0xff;
         const uint32_t green = util_format_linear_to_srgb_8unorm(rgba[1]) & 0xff;
         const uint32_t red = util_format_linear_to_srgb_8unorm(rgba[0]) & 0xff;

         out[col] = (blue << blue_lsb) | (green << green_lsb) | (red << red_lsb);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack one row of a8r8g8b8_srgb pixels into RGBA floats.
 *
 * Colour channels go through util_format_srgb_8unorm_to_linear_float();
 * alpha is converted with ubyte_to_float().
 */
void
util_format_a8r8g8b8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit offset of each channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { alpha_lsb = 24, red_lsb = 16, green_lsb = 8, blue_lsb = 0 };
#else
   enum { alpha_lsb = 0, red_lsb = 8, green_lsb = 16, blue_lsb = 24 };
#endif
   const uint32_t *words = (const uint32_t *)src;
   float *rgba = dst_row;

   for (unsigned n = 0; n < width; n++, rgba += 4) {
      const uint32_t px = words[n];
      const uint32_t alpha = (px >> alpha_lsb) & 0xff;
      const uint32_t red = (px >> red_lsb) & 0xff;
      const uint32_t green = (px >> green_lsb) & 0xff;
      const uint32_t blue = (px >> blue_lsb) & 0xff;

      rgba[0] = util_format_srgb_8unorm_to_linear_float(red);
      rgba[1] = util_format_srgb_8unorm_to_linear_float(green);
      rgba[2] = util_format_srgb_8unorm_to_linear_float(blue);
      rgba[3] = ubyte_to_float(alpha);
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into a8r8g8b8_srgb.
 *
 * Alpha is converted with float_to_ubyte(); colour channels are passed
 * through util_format_linear_float_to_srgb_8unorm().  Both strides are byte
 * strides.
 */
void
util_format_a8r8g8b8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit offset of each channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { alpha_lsb = 24, red_lsb = 16, green_lsb = 8, blue_lsb = 0 };
#else
   enum { alpha_lsb = 0, red_lsb = 8, green_lsb = 16, blue_lsb = 24 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++, rgba += 4) {
         const uint32_t alpha = float_to_ubyte(rgba[3]) & 0xff;
         const uint32_t red = util_format_linear_float_to_srgb_8unorm(rgba[0]) & 0xff;
         const uint32_t green = util_format_linear_float_to_srgb_8unorm(rgba[1]) & 0xff;
         const uint32_t blue = util_format_linear_float_to_srgb_8unorm(rgba[2]) & 0xff;

         out[col] = (alpha << alpha_lsb) | (red << red_lsb) |
                    (green << green_lsb) | (blue << blue_lsb);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one a8r8g8b8_srgb texel as four floats (r, g, b, a).
 *
 * Colour channels go through util_format_srgb_8unorm_to_linear_float();
 * alpha is converted with ubyte_to_float().  The i and j coordinates are
 * unused.
 */
void
util_format_a8r8g8b8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit offset of each channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { alpha_lsb = 24, red_lsb = 16, green_lsb = 8, blue_lsb = 0 };
#else
   enum { alpha_lsb = 0, red_lsb = 8, green_lsb = 16, blue_lsb = 24 };
#endif
   const uint32_t texel = *(const uint32_t *)src;
   const uint32_t alpha = (texel >> alpha_lsb) & 0xff;
   const uint32_t red = (texel >> red_lsb) & 0xff;
   const uint32_t green = (texel >> green_lsb) & 0xff;
   const uint32_t blue = (texel >> blue_lsb) & 0xff;
   float *rgba = in_dst;

   rgba[0] = util_format_srgb_8unorm_to_linear_float(red);
   rgba[1] = util_format_srgb_8unorm_to_linear_float(green);
   rgba[2] = util_format_srgb_8unorm_to_linear_float(blue);
   rgba[3] = ubyte_to_float(alpha);
}

/**
 * Unpack one row of a8r8g8b8_srgb pixels into 8-bit RGBA.
 *
 * Colour channels go through util_format_srgb_to_linear_8unorm(); the alpha
 * byte is copied unchanged.
 */
void
util_format_a8r8g8b8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit offset of each channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { alpha_lsb = 24, red_lsb = 16, green_lsb = 8, blue_lsb = 0 };
#else
   enum { alpha_lsb = 0, red_lsb = 8, green_lsb = 16, blue_lsb = 24 };
#endif
   const uint32_t *words = (const uint32_t *)src;
   uint8_t *rgba = dst_row;

   for (unsigned n = 0; n < width; n++, rgba += 4) {
      const uint32_t px = words[n];
      const uint32_t alpha = (px >> alpha_lsb) & 0xff;
      const uint32_t red = (px >> red_lsb) & 0xff;
      const uint32_t green = (px >> green_lsb) & 0xff;
      const uint32_t blue = (px >> blue_lsb) & 0xff;

      rgba[0] = util_format_srgb_to_linear_8unorm(red);
      rgba[1] = util_format_srgb_to_linear_8unorm(green);
      rgba[2] = util_format_srgb_to_linear_8unorm(blue);
      rgba[3] = alpha;
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into a8r8g8b8_srgb.
 *
 * The alpha byte is stored unchanged; colour channels are passed through
 * util_format_linear_to_srgb_8unorm().  Both strides are byte strides.
 */
void
util_format_a8r8g8b8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit offset of each channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { alpha_lsb = 24, red_lsb = 16, green_lsb = 8, blue_lsb = 0 };
#else
   enum { alpha_lsb = 0, red_lsb = 8, green_lsb = 16, blue_lsb = 24 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++, rgba += 4) {
         const uint32_t alpha = rgba[3];
         const uint32_t red = util_format_linear_to_srgb_8unorm(rgba[0]) & 0xff;
         const uint32_t green = util_format_linear_to_srgb_8unorm(rgba[1]) & 0xff;
         const uint32_t blue = util_format_linear_to_srgb_8unorm(rgba[2]) & 0xff;

         out[col] = (alpha << alpha_lsb) | (red << red_lsb) |
                    (green << green_lsb) | (blue << blue_lsb);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack one row of x8r8g8b8_srgb pixels into RGBA floats.
 *
 * Colour channels go through util_format_srgb_8unorm_to_linear_float(); the
 * X byte is not read and alpha is always written as 1.
 */
void
util_format_x8r8g8b8_srgb_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { red_lsb = 16, green_lsb = 8, blue_lsb = 0 };
#else
   enum { red_lsb = 8, green_lsb = 16, blue_lsb = 24 };
#endif
   const uint32_t *words = (const uint32_t *)src;
   float *rgba = dst_row;

   for (unsigned n = 0; n < width; n++, rgba += 4) {
      const uint32_t px = words[n];
      const uint32_t red = (px >> red_lsb) & 0xff;
      const uint32_t green = (px >> green_lsb) & 0xff;
      const uint32_t blue = (px >> blue_lsb) & 0xff;

      rgba[0] = util_format_srgb_8unorm_to_linear_float(red);
      rgba[1] = util_format_srgb_8unorm_to_linear_float(green);
      rgba[2] = util_format_srgb_8unorm_to_linear_float(blue);
      rgba[3] = 1;
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into x8r8g8b8_srgb.
 *
 * The first three floats of every source pixel are passed through
 * util_format_linear_float_to_srgb_8unorm(); source alpha is not read and
 * the X byte of each output word stays zero.  Both strides are byte strides.
 */
void
util_format_x8r8g8b8_srgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { red_lsb = 16, green_lsb = 8, blue_lsb = 0 };
#else
   enum { red_lsb = 8, green_lsb = 16, blue_lsb = 24 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++, rgba += 4) {
         const uint32_t red = util_format_linear_float_to_srgb_8unorm(rgba[0]) & 0xff;
         const uint32_t green = util_format_linear_float_to_srgb_8unorm(rgba[1]) & 0xff;
         const uint32_t blue = util_format_linear_float_to_srgb_8unorm(rgba[2]) & 0xff;

         out[col] = (red << red_lsb) | (green << green_lsb) | (blue << blue_lsb);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one x8r8g8b8_srgb texel as four floats (r, g, b, a).
 *
 * Colour channels go through util_format_srgb_8unorm_to_linear_float();
 * alpha is always written as 1.  The i and j coordinates are unused.
 */
void
util_format_x8r8g8b8_srgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { red_lsb = 16, green_lsb = 8, blue_lsb = 0 };
#else
   enum { red_lsb = 8, green_lsb = 16, blue_lsb = 24 };
#endif
   const uint32_t texel = *(const uint32_t *)src;
   const uint32_t red = (texel >> red_lsb) & 0xff;
   const uint32_t green = (texel >> green_lsb) & 0xff;
   const uint32_t blue = (texel >> blue_lsb) & 0xff;
   float *rgba = in_dst;

   rgba[0] = util_format_srgb_8unorm_to_linear_float(red);
   rgba[1] = util_format_srgb_8unorm_to_linear_float(green);
   rgba[2] = util_format_srgb_8unorm_to_linear_float(blue);
   rgba[3] = 1;
}

/**
 * Unpack one row of x8r8g8b8_srgb pixels into 8-bit RGBA.
 *
 * Colour channels go through util_format_srgb_to_linear_8unorm(); the X byte
 * is not read and alpha is always written as 255.
 */
void
util_format_x8r8g8b8_srgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { red_lsb = 16, green_lsb = 8, blue_lsb = 0 };
#else
   enum { red_lsb = 8, green_lsb = 16, blue_lsb = 24 };
#endif
   const uint32_t *words = (const uint32_t *)src;
   uint8_t *rgba = dst_row;

   for (unsigned n = 0; n < width; n++, rgba += 4) {
      const uint32_t px = words[n];
      const uint32_t red = (px >> red_lsb) & 0xff;
      const uint32_t green = (px >> green_lsb) & 0xff;
      const uint32_t blue = (px >> blue_lsb) & 0xff;

      rgba[0] = util_format_srgb_to_linear_8unorm(red);
      rgba[1] = util_format_srgb_to_linear_8unorm(green);
      rgba[2] = util_format_srgb_to_linear_8unorm(blue);
      rgba[3] = 255;
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into x8r8g8b8_srgb.
 *
 * The first three bytes of every source pixel are passed through
 * util_format_linear_to_srgb_8unorm(); source alpha is not read and the X
 * byte of each output word stays zero.  Both strides are byte strides.
 */
void
util_format_x8r8g8b8_srgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit offset of each colour channel inside the 32-bit pixel word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { red_lsb = 16, green_lsb = 8, blue_lsb = 0 };
#else
   enum { red_lsb = 8, green_lsb = 16, blue_lsb = 24 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++, rgba += 4) {
         const uint32_t red = util_format_linear_to_srgb_8unorm(rgba[0]) & 0xff;
         const uint32_t green = util_format_linear_to_srgb_8unorm(rgba[1]) & 0xff;
         const uint32_t blue = util_format_linear_to_srgb_8unorm(rgba[2]) & 0xff;

         out[col] = (red << red_lsb) | (green << green_lsb) | (blue << blue_lsb);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack one row of r8sg8sb8ux8u_norm pixels into RGBA floats.
 *
 * Red and green are signed 8-bit fields in bits 0-7 and 8-15 of the 32-bit
 * pixel word, scaled by 1/0x7f; blue is the unsigned byte in bits 16-23,
 * converted with ubyte_to_float().  Bits 24-31 are ignored and alpha is
 * always written as 1.
 *
 * The field positions are the same for both values of UTIL_ARCH_BIG_ENDIAN,
 * so a single code path is used.
 *
 * A raw signed value of -128 would scale to -128/127, slightly below -1.0.
 * The signed-normalized conversion rules of the graphics APIs clamp the
 * result to -1.0, so the same clamp is applied here.
 */
void
util_format_r8sg8sb8ux8u_norm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const uint32_t *words = (const uint32_t *)src;
   float *rgba = dst_row;

   for (unsigned n = 0; n < width; n++, rgba += 4) {
      const uint32_t px = words[n];
      /* Shift the field to the top of the word, then shift back down as a
       * signed value to sign-extend it. */
      const int32_t r = ((int32_t)(px << 24)) >> 24;
      const int32_t g = ((int32_t)(px << 16)) >> 24;
      const uint32_t b = (px >> 16) & 0xff;
      const float red = (float)(r * (1.0f/0x7f));
      const float green = (float)(g * (1.0f/0x7f));

      rgba[0] = red < -1.0f ? -1.0f : red;     /* r */
      rgba[1] = green < -1.0f ? -1.0f : green; /* g */
      rgba[2] = ubyte_to_float(b);             /* b */
      rgba[3] = 1;                             /* a */
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into
 * r8sg8sb8ux8u_norm.
 *
 * Red and green are clamped to [-1, 1], scaled by 0x7f and rounded with
 * util_iround() into bits 0-7 and 8-15; blue is converted with
 * float_to_ubyte() into bits 16-23.  Source alpha is not read and bits 24-31
 * stay zero.  Both strides are byte strides.
 *
 * The resulting word is the same for both values of UTIL_ARCH_BIG_ENDIAN, so
 * a single code path is used.
 */
void
util_format_r8sg8sb8ux8u_norm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++, rgba += 4) {
         const uint32_t red = ((uint32_t)util_iround(CLAMP(rgba[0], -1.0f, 1.0f) * 0x7f)) & 0xff;
         const uint32_t green = ((uint32_t)util_iround(CLAMP(rgba[1], -1.0f, 1.0f) * 0x7f)) & 0xff;
         const uint32_t blue = (float_to_ubyte(rgba[2])) & 0xff;

         out[col] = red | (green << 8) | (blue << 16);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one r8sg8sb8ux8u_norm texel as four floats (r, g, b, a).
 *
 * Red and green are signed 8-bit fields in bits 0-7 and 8-15 of the 32-bit
 * pixel word, scaled by 1/0x7f; blue is the unsigned byte in bits 16-23,
 * converted with ubyte_to_float().  Bits 24-31 are ignored and alpha is
 * always written as 1.  The i and j coordinates are unused.
 *
 * The field positions are the same for both values of UTIL_ARCH_BIG_ENDIAN,
 * so a single code path is used.
 *
 * A raw signed value of -128 would scale to -128/127, slightly below -1.0.
 * The signed-normalized conversion rules of the graphics APIs clamp the
 * result to -1.0, so the same clamp is applied here.
 */
void
util_format_r8sg8sb8ux8u_norm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint32_t texel = *(const uint32_t *)src;
   /* Shift the field to the top of the word, then shift back down as a
    * signed value to sign-extend it. */
   const int32_t r = ((int32_t)(texel << 24)) >> 24;
   const int32_t g = ((int32_t)(texel << 16)) >> 24;
   const uint32_t b = (texel >> 16) & 0xff;
   const float red = (float)(r * (1.0f/0x7f));
   const float green = (float)(g * (1.0f/0x7f));
   float *rgba = in_dst;

   rgba[0] = red < -1.0f ? -1.0f : red;     /* r */
   rgba[1] = green < -1.0f ? -1.0f : green; /* g */
   rgba[2] = ubyte_to_float(b);             /* b */
   rgba[3] = 1;                             /* a */
}

/**
 * Unpack one row of r8sg8sb8ux8u_norm pixels into 8-bit RGBA.
 *
 * Red and green are sign-extended from bits 0-7 and 8-15, passed through
 * MAX2(value, 0) and then converted with _mesa_snorm_to_unorm(…, 8, 8).
 * Blue (bits 16-23) is copied as-is, bits 24-31 are ignored and alpha is
 * always written as 255.
 *
 * The field positions are the same for both values of UTIL_ARCH_BIG_ENDIAN,
 * so a single code path is used.
 */
void
util_format_r8sg8sb8ux8u_norm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const uint32_t *words = (const uint32_t *)src;
   uint8_t *rgba = dst_row;

   for (unsigned n = 0; n < width; n++, rgba += 4) {
      const uint32_t px = words[n];
      const int32_t red = ((int32_t)(px << 24)) >> 24;
      const int32_t green = ((int32_t)(px << 16)) >> 24;
      const uint32_t blue = (px >> 16) & 0xff;

      rgba[0] = _mesa_snorm_to_unorm(MAX2(red, 0), 8, 8);
      rgba[1] = _mesa_snorm_to_unorm(MAX2(green, 0), 8, 8);
      rgba[2] = blue;
      rgba[3] = 255;
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into
 * r8sg8sb8ux8u_norm.
 *
 * Red and green are converted with _mesa_unorm_to_snorm(…, 8, 8) and stored
 * in bits 0-7 and 8-15; blue is stored unchanged in bits 16-23.  Source
 * alpha is not read and bits 24-31 stay zero.  Both strides are byte
 * strides.
 *
 * The resulting word is the same for both values of UTIL_ARCH_BIG_ENDIAN, so
 * a single code path is used.
 */
void
util_format_r8sg8sb8ux8u_norm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *rgba = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++, rgba += 4) {
         const uint32_t red = (uint32_t)((_mesa_unorm_to_snorm(rgba[0], 8, 8)) & 0xff);
         const uint32_t green = (uint32_t)((_mesa_unorm_to_snorm(rgba[1], 8, 8)) & 0xff);
         const uint32_t blue = (uint32_t)((rgba[2]) & 0xff);

         out[col] = red | (green << 8) | (blue << 16);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of R10SG10SB10SA2U_NORM pixels to RGBA float.
 *
 * Each 32-bit pixel holds signed 10-bit R (bits 0-9), G (bits 10-19) and
 * B (bits 20-29), scaled by 1/511, plus an unsigned 2-bit A (bits 30-31),
 * scaled by 1/3.  Signed channels are clamped so that the most negative
 * code (-512) yields exactly -1.0.
 *
 * dst_row receives 4 floats per pixel; src supplies 4 bytes per pixel and
 * does not need to be 4-byte aligned; width is the pixel count.
 */
void
util_format_r10sg10sb10sa2u_norm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Load through memcpy: a uint32_t pointer cast on a byte buffer is
       * undefined when misaligned.  The native word has the same bit
       * layout on little- and big-endian hosts. */
      uint32_t value;
      memcpy(&value, src, sizeof value);

      /* Move each signed field to the top of the word, then shift back
       * down arithmetically to sign-extend it. */
      const int32_t r = ((int32_t)(value << 22)) >> 22;
      const int32_t g = ((int32_t)(value << 12)) >> 22;
      const int32_t b = ((int32_t)(value << 2)) >> 22;
      const uint32_t a = value >> 30;

      const float rf = (float)(r * (1.0f/0x1ff));
      const float gf = (float)(g * (1.0f/0x1ff));
      const float bf = (float)(b * (1.0f/0x1ff));

      /* SNORM conversion: results below -1.0 are clamped to -1.0. */
      dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
      dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
      dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
      dst[3] = (float)(a * (1.0f/0x3)); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into R10SG10SB10SA2U_NORM.
 *
 * R, G and B are clamped to [-1, 1], scaled by 511, rounded with
 * util_iround and stored as 10-bit two's-complement fields at bits 0-9,
 * 10-19 and 20-29.  A is clamped to [0, 1], scaled by 3 and stored in
 * bits 30-31.
 *
 * dst_stride is in bytes.  src_stride is in bytes as well; it is divided by
 * sizeof(float) before being added to the row pointer.
 */
void
util_format_r10sg10sb10sa2u_norm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Built as a native word: identical bit positions on either
          * endianness. */
         uint32_t value = 0;
         value |= (uint32_t)(((uint32_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x1ff)) & 0x3ff);
         value |= (uint32_t)(((uint32_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x1ff)) & 0x3ff) << 10;
         value |= (uint32_t)(((uint32_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x1ff)) & 0x3ff) << 20;
         value |= (uint32_t)((uint32_t)util_iround(CLAMP(src[3], 0.0f, 1.0f) * 0x3)) << 30;
         /* memcpy rather than a uint32_t pointer cast: dst is a byte
          * pointer and need not be 4-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R10SG10SB10SA2U_NORM pixel as RGBA float.
 *
 * src points at the 4-byte pixel (no alignment required); in_dst receives
 * four floats.  The i/j coordinates are unused.  Signed channels are
 * clamped so that the most negative code (-512) yields exactly -1.0.
 */
void
util_format_r10sg10sb10sa2u_norm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;

   /* Load through memcpy: a uint32_t pointer cast on a byte buffer is
    * undefined when misaligned.  Bit positions are the same on little- and
    * big-endian hosts. */
   uint32_t value;
   memcpy(&value, src, sizeof value);

   /* Shift up, then arithmetic-shift down, to sign-extend each field. */
   const int32_t r = ((int32_t)(value << 22)) >> 22;
   const int32_t g = ((int32_t)(value << 12)) >> 22;
   const int32_t b = ((int32_t)(value << 2)) >> 22;
   const uint32_t a = value >> 30;

   const float rf = (float)(r * (1.0f/0x1ff));
   const float gf = (float)(g * (1.0f/0x1ff));
   const float bf = (float)(b * (1.0f/0x1ff));

   /* SNORM conversion: results below -1.0 are clamped to -1.0. */
   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
   dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
   dst[3] = (float)(a * (1.0f/0x3)); /* a */
}

/**
 * Unpack a row of R10SG10SB10SA2U_NORM pixels to RGBA 8-bit UNORM.
 *
 * Negative R/G/B values are raised to 0 before being passed to
 * _mesa_snorm_to_unorm(.., 10, 8); the 2-bit alpha goes through
 * _mesa_unorm_to_unorm(.., 2, 8).
 *
 * dst_row receives 4 bytes per pixel; src supplies 4 bytes per pixel and
 * does not need to be 4-byte aligned; width is the pixel count.
 */
void
util_format_r10sg10sb10sa2u_norm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Load through memcpy: a uint32_t pointer cast on a byte buffer is
       * undefined when misaligned.  Bit positions are the same on little-
       * and big-endian hosts. */
      uint32_t value;
      memcpy(&value, src, sizeof value);

      /* Shift up, then arithmetic-shift down, to sign-extend each field. */
      const int32_t r = ((int32_t)(value << 22)) >> 22;
      const int32_t g = ((int32_t)(value << 12)) >> 22;
      const int32_t b = ((int32_t)(value << 2)) >> 22;
      const uint32_t a = value >> 30;

      dst[0] = _mesa_snorm_to_unorm(MAX2(r, 0), 10, 8); /* r */
      dst[1] = _mesa_snorm_to_unorm(MAX2(g, 0), 10, 8); /* g */
      dst[2] = _mesa_snorm_to_unorm(MAX2(b, 0), 10, 8); /* b */
      dst[3] = _mesa_unorm_to_unorm(a, 2, 8); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA 8-bit UNORM pixels into R10SG10SB10SA2U_NORM.
 *
 * R, G and B go through _mesa_unorm_to_snorm(.., 8, 10) and land in bits
 * 0-9, 10-19 and 20-29; A goes through _mesa_unorm_to_unorm(.., 8, 2) and
 * lands in bits 30-31.
 *
 * dst_stride is in bytes.  src_stride is in bytes as well; it is divided by
 * the source element size before being added to the row pointer.
 */
void
util_format_r10sg10sb10sa2u_norm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Built as a native word: identical bit positions on either
          * endianness. */
         uint32_t value = 0;
         value |= (uint32_t)((_mesa_unorm_to_snorm(src[0], 8, 10)) & 0x3ff);
         value |= (uint32_t)((_mesa_unorm_to_snorm(src[1], 8, 10)) & 0x3ff) << 10;
         value |= (uint32_t)((_mesa_unorm_to_snorm(src[2], 8, 10)) & 0x3ff) << 20;
         value |= (uint32_t)(_mesa_unorm_to_unorm(src[3], 8, 2)) << 30;
         /* memcpy rather than a uint32_t pointer cast: dst is a byte
          * pointer and need not be 4-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R5SG5SB6U_NORM pixels to RGBA float.
 *
 * Each 16-bit pixel holds signed 5-bit R (bits 0-4) and G (bits 5-9),
 * scaled by 1/15, and unsigned 6-bit B (bits 10-15), scaled by 1/63.
 * Alpha is always written as 1.  Signed channels are clamped so that the
 * most negative code (-16) yields exactly -1.0.
 *
 * dst_row receives 4 floats per pixel; src supplies 2 bytes per pixel and
 * does not need to be 2-byte aligned; width is the pixel count.
 */
void
util_format_r5sg5sb6u_norm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Load through memcpy: a uint16_t pointer cast on a byte buffer is
       * undefined when misaligned.  Bit positions are the same on little-
       * and big-endian hosts. */
      uint16_t value;
      memcpy(&value, src, sizeof value);

      /* Shift each signed field to the top of a 16-bit value, then shift
       * back down arithmetically to sign-extend it. */
      const int16_t r = ((int16_t)(value << 11)) >> 11;
      const int16_t g = ((int16_t)(value << 6)) >> 11;
      const uint16_t b = value >> 10;

      const float rf = (float)(r * (1.0f/0xf));
      const float gf = (float)(g * (1.0f/0xf));

      /* SNORM conversion: results below -1.0 are clamped to -1.0. */
      dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
      dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
      dst[2] = (float)(b * (1.0f/0x3f)); /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into R5SG5SB6U_NORM.
 *
 * R and G are clamped to [-1, 1], scaled by 15, rounded with util_iround
 * and stored as 5-bit two's-complement fields at bits 0-4 and 5-9.  B is
 * clamped to [0, 1], scaled by 63 and stored in bits 10-15.  The source
 * alpha is ignored.
 *
 * dst_stride is in bytes.  src_stride is in bytes as well; it is divided by
 * sizeof(float) before being added to the row pointer.
 */
void
util_format_r5sg5sb6u_norm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Built as a native word: identical bit positions on either
          * endianness. */
         uint16_t value = 0;
         value |= (uint16_t)(((uint16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0xf)) & 0x1f);
         value |= (uint16_t)((uint32_t)(((uint16_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0xf)) & 0x1f) << 5);
         value |= (uint32_t)((uint16_t)util_iround(CLAMP(src[2], 0.0f, 1.0f) * 0x3f)) << 10;
         /* memcpy rather than a uint16_t pointer cast: dst is a byte
          * pointer and need not be 2-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R5SG5SB6U_NORM pixel as RGBA float.
 *
 * src points at the 2-byte pixel (no alignment required); in_dst receives
 * four floats, with alpha always 1.  The i/j coordinates are unused.
 * Signed channels are clamped so that the most negative code (-16) yields
 * exactly -1.0.
 */
void
util_format_r5sg5sb6u_norm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;

   /* Load through memcpy: a uint16_t pointer cast on a byte buffer is
    * undefined when misaligned.  Bit positions are the same on little- and
    * big-endian hosts. */
   uint16_t value;
   memcpy(&value, src, sizeof value);

   /* Shift up, then arithmetic-shift down, to sign-extend each field. */
   const int16_t r = ((int16_t)(value << 11)) >> 11;
   const int16_t g = ((int16_t)(value << 6)) >> 11;
   const uint16_t b = value >> 10;

   const float rf = (float)(r * (1.0f/0xf));
   const float gf = (float)(g * (1.0f/0xf));

   /* SNORM conversion: results below -1.0 are clamped to -1.0. */
   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
   dst[2] = (float)(b * (1.0f/0x3f)); /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a row of R5SG5SB6U_NORM pixels to RGBA 8-bit UNORM.
 *
 * Negative R/G values are raised to 0 before being passed to
 * _mesa_snorm_to_unorm(.., 5, 8); the 6-bit B goes through
 * _mesa_unorm_to_unorm(.., 6, 8); alpha is always 255.
 *
 * dst_row receives 4 bytes per pixel; src supplies 2 bytes per pixel and
 * does not need to be 2-byte aligned; width is the pixel count.
 */
void
util_format_r5sg5sb6u_norm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Load through memcpy: a uint16_t pointer cast on a byte buffer is
       * undefined when misaligned.  Bit positions are the same on little-
       * and big-endian hosts. */
      uint16_t value;
      memcpy(&value, src, sizeof value);

      /* Shift up, then arithmetic-shift down, to sign-extend each field. */
      const int16_t r = ((int16_t)(value << 11)) >> 11;
      const int16_t g = ((int16_t)(value << 6)) >> 11;
      const uint16_t b = value >> 10;

      dst[0] = _mesa_snorm_to_unorm(MAX2(r, 0), 5, 8); /* r */
      dst[1] = _mesa_snorm_to_unorm(MAX2(g, 0), 5, 8); /* g */
      dst[2] = _mesa_unorm_to_unorm(b, 6, 8); /* b */
      dst[3] = 255; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA 8-bit UNORM pixels into R5SG5SB6U_NORM.
 *
 * R and G go through _mesa_unorm_to_snorm(.., 8, 5) and land in bits 0-4
 * and 5-9; B goes through _mesa_unorm_to_unorm(.., 8, 6) and lands in bits
 * 10-15.  The source alpha is ignored.
 *
 * dst_stride is in bytes.  src_stride is in bytes as well; it is divided by
 * the source element size before being added to the row pointer.
 */
void
util_format_r5sg5sb6u_norm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Built as a native word: identical bit positions on either
          * endianness. */
         uint16_t value = 0;
         value |= (uint16_t)((_mesa_unorm_to_snorm(src[0], 8, 5)) & 0x1f);
         value |= (uint16_t)((uint32_t)((_mesa_unorm_to_snorm(src[1], 8, 5)) & 0x1f) << 5);
         value |= (uint32_t)(_mesa_unorm_to_unorm(src[2], 8, 6)) << 10;
         /* memcpy rather than a uint16_t pointer cast: dst is a byte
          * pointer and need not be 2-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of Z24_UNORM_S8_UINT_AS_R8G8B8A8 pixels to RGBA float.
 *
 * The pixel is four consecutive bytes in memory, in R, G, B, A order; each
 * byte is converted with ubyte_to_float.
 *
 * dst_row receives 4 floats per pixel; src supplies 4 bytes per pixel with
 * no alignment requirement; width is the pixel count.
 */
void
util_format_z24_unorm_s8_uint_as_r8g8b8a8_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Read the bytes directly.  This selects the same byte per channel
       * as an endian-specific 32-bit load plus shifts, without needing an
       * aligned word access. */
      dst[0] = ubyte_to_float(src[0]); /* r */
      dst[1] = ubyte_to_float(src[1]); /* g */
      dst[2] = ubyte_to_float(src[2]); /* b */
      dst[3] = ubyte_to_float(src[3]); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into Z24_UNORM_S8_UINT_AS_R8G8B8A8.
 *
 * Each channel is converted with float_to_ubyte and stored as one byte, in
 * R, G, B, A memory order.
 *
 * dst_stride is in bytes.  src_stride is in bytes as well; it is divided by
 * sizeof(float) before being added to the row pointer.
 */
void
util_format_z24_unorm_s8_uint_as_r8g8b8a8_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Store the bytes directly.  This yields the same memory image as
          * assembling an endian-specific 32-bit word, without needing an
          * aligned word store. */
         dst[0] = float_to_ubyte(src[0]); /* r */
         dst[1] = float_to_ubyte(src[1]); /* g */
         dst[2] = float_to_ubyte(src[2]); /* b */
         dst[3] = float_to_ubyte(src[3]); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one Z24_UNORM_S8_UINT_AS_R8G8B8A8 pixel as RGBA float.
 *
 * src points at the four pixel bytes in R, G, B, A order (no alignment
 * required); in_dst receives four floats.  The i/j coordinates are unused.
 */
void
util_format_z24_unorm_s8_uint_as_r8g8b8a8_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;

   /* Read the bytes directly.  This selects the same byte per channel as an
    * endian-specific 32-bit load plus shifts, without needing an aligned
    * word access. */
   dst[0] = ubyte_to_float(src[0]); /* r */
   dst[1] = ubyte_to_float(src[1]); /* g */
   dst[2] = ubyte_to_float(src[2]); /* b */
   dst[3] = ubyte_to_float(src[3]); /* a */
}

/**
 * Unpack a row of Z24_UNORM_S8_UINT_AS_R8G8B8A8 pixels to RGBA 8-bit UNORM.
 *
 * Source and destination share the same byte layout (R, G, B, A), so each
 * pixel is a straight 4-byte copy.
 *
 * dst_row receives 4 bytes per pixel; src supplies 4 bytes per pixel with
 * no alignment requirement; width is the pixel count.
 */
void
util_format_z24_unorm_s8_uint_as_r8g8b8a8_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Copy the bytes directly.  This selects the same byte per channel
       * as an endian-specific 32-bit load plus shifts, without needing an
       * aligned word access. */
      dst[0] = src[0]; /* r */
      dst[1] = src[1]; /* g */
      dst[2] = src[2]; /* b */
      dst[3] = src[3]; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA 8-bit UNORM pixels into Z24_UNORM_S8_UINT_AS_R8G8B8A8.
 *
 * Source and destination share the same byte layout (R, G, B, A), so each
 * pixel is a straight 4-byte copy.
 *
 * dst_stride is in bytes.  src_stride is in bytes as well; it is divided by
 * the source element size before being added to the row pointer.
 */
void
util_format_z24_unorm_s8_uint_as_r8g8b8a8_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Store the bytes directly.  This yields the same memory image as
          * assembling an endian-specific 32-bit word, without needing an
          * aligned word store. */
         dst[0] = src[0]; /* r */
         dst[1] = src[1]; /* g */
         dst[2] = src[2]; /* b */
         dst[3] = src[3]; /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R64_FLOAT pixel: a single double holding red. */
struct util_format_r64_float {
   double r;
};

void
util_format_r64_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_r64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
      src += 8;
      dst += 4;
   }
}

void
util_format_r64_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r64_float pixel = {0};
         pixel.r = (double)src[0];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R64_FLOAT pixel as RGBA float.
 *
 * Writes (red, 0, 0, 1) to in_dst.  The i/j coordinates are unused.
 */
void
util_format_r64_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r64_float texel;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)texel.r;
   rgba[1] = 0.0f;
   rgba[2] = 0.0f;
   rgba[3] = 1.0f;
}

void
util_format_r64_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_r64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
      src += 8;
      dst += 4;
   }
}

void
util_format_r64_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r64_float pixel = {0};
         pixel.r = (double)(src[0] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R64G64_FLOAT pixel: two doubles, red then green.
 * The field order is the same on little- and big-endian hosts. */
struct util_format_r64g64_float {
   double r;
   double g;
};

void
util_format_r64g64_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r64g64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r64g64_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64_float pixel = {0};
         pixel.r = (double)src[0];
         pixel.g = (double)src[1];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64_float pixel = {0};
         pixel.r = (double)src[0];
         pixel.g = (double)src[1];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R64G64_FLOAT pixel as RGBA float.
 *
 * Writes (red, green, 0, 1) to in_dst.  The i/j coordinates are unused.
 */
void
util_format_r64g64_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r64g64_float texel;

   /* The conversion is the same on either host endianness. */
   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)texel.r;
   rgba[1] = (float)texel.g;
   rgba[2] = 0.0f;
   rgba[3] = 1.0f;
}

void
util_format_r64g64_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround(CLAMP(pixel.g, 0.0, 1.0) * 0xff); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r64g64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround(CLAMP(pixel.g, 0.0, 1.0) * 0xff); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r64g64_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64_float pixel = {0};
         pixel.r = (double)(src[0] * (1.0f/0xff));
         pixel.g = (double)(src[1] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64_float pixel = {0};
         pixel.r = (double)(src[0] * (1.0f/0xff));
         pixel.g = (double)(src[1] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R64G64B64_FLOAT pixel: three doubles in
 * red, green, blue order.  The field order is the same on little- and
 * big-endian hosts. */
struct util_format_r64g64b64_float {
   double r;
   double g;
   double b;
};

void
util_format_r64g64b64_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r64g64b64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 24;
      dst += 4;
   }
}

void
util_format_r64g64b64_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64_float pixel = {0};
         pixel.r = (double)src[0];
         pixel.g = (double)src[1];
         pixel.b = (double)src[2];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64b64_float pixel = {0};
         pixel.r = (double)src[0];
         pixel.g = (double)src[1];
         pixel.b = (double)src[2];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 24;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R64G64B64_FLOAT pixel as RGBA float.
 *
 * Writes (red, green, blue, 1) to in_dst.  The i/j coordinates are unused.
 */
void
util_format_r64g64b64_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r64g64b64_float texel;

   /* The conversion is the same on either host endianness. */
   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)texel.r;
   rgba[1] = (float)texel.g;
   rgba[2] = (float)texel.b;
   rgba[3] = 1.0f;
}

void
util_format_r64g64b64_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround(CLAMP(pixel.g, 0.0, 1.0) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround(CLAMP(pixel.b, 0.0, 1.0) * 0xff); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r64g64b64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround(CLAMP(pixel.g, 0.0, 1.0) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround(CLAMP(pixel.b, 0.0, 1.0) * 0xff); /* b */
         dst[3] = 255; /* a */
#endif
      src += 24;
      dst += 4;
   }
}

void
util_format_r64g64b64_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64_float pixel = {0};
         pixel.r = (double)(src[0] * (1.0f/0xff));
         pixel.g = (double)(src[1] * (1.0f/0xff));
         pixel.b = (double)(src[2] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64b64_float pixel = {0};
         pixel.r = (double)(src[0] * (1.0f/0xff));
         pixel.g = (double)(src[1] * (1.0f/0xff));
         pixel.b = (double)(src[2] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 24;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R64G64B64A64_FLOAT pixel: four doubles in
 * red, green, blue, alpha order.  The field order is the same on little-
 * and big-endian hosts. */
struct util_format_r64g64b64a64_float {
   double r;
   double g;
   double b;
   double a;
};

void
util_format_r64g64b64a64_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64a64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#else
         struct util_format_r64g64b64a64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#endif
      src += 32;
      dst += 4;
   }
}

void
util_format_r64g64b64a64_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64a64_float pixel = {0};
         pixel.r = (double)src[0];
         pixel.g = (double)src[1];
         pixel.b = (double)src[2];
         pixel.a = (double)src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64b64a64_float pixel = {0};
         pixel.r = (double)src[0];
         pixel.g = (double)src[1];
         pixel.b = (double)src[2];
         pixel.a = (double)src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 32;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R64G64B64A64_FLOAT pixel as RGBA float.
 *
 * Writes all four channels, narrowed to float, to in_dst.  The i/j
 * coordinates are unused.
 */
void
util_format_r64g64b64a64_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r64g64b64a64_float texel;

   /* The conversion is the same on either host endianness. */
   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)texel.r;
   rgba[1] = (float)texel.g;
   rgba[2] = (float)texel.b;
   rgba[3] = (float)texel.a;
}

void
util_format_r64g64b64a64_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64a64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround(CLAMP(pixel.g, 0.0, 1.0) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround(CLAMP(pixel.b, 0.0, 1.0) * 0xff); /* b */
         dst[3] = (uint8_t)util_iround(CLAMP(pixel.a, 0.0, 1.0) * 0xff); /* a */
#else
         struct util_format_r64g64b64a64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround(CLAMP(pixel.g, 0.0, 1.0) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround(CLAMP(pixel.b, 0.0, 1.0) * 0xff); /* b */
         dst[3] = (uint8_t)util_iround(CLAMP(pixel.a, 0.0, 1.0) * 0xff); /* a */
#endif
      src += 32;
      dst += 4;
   }
}

void
util_format_r64g64b64a64_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64a64_float pixel = {0};
         pixel.r = (double)(src[0] * (1.0f/0xff));
         pixel.g = (double)(src[1] * (1.0f/0xff));
         pixel.b = (double)(src[2] * (1.0f/0xff));
         pixel.a = (double)(src[3] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64b64a64_float pixel = {0};
         pixel.r = (double)(src[0] * (1.0f/0xff));
         pixel.g = (double)(src[1] * (1.0f/0xff));
         pixel.b = (double)(src[2] * (1.0f/0xff));
         pixel.a = (double)(src[3] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 32;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* Channel layout of one R32_FLOAT texel: a single float red channel. */
struct util_format_r32_float {
   float r;
};

void
util_format_r32_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_r32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

void
util_format_r32_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r32_float pixel = {0};
         pixel.r = src[0];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32_FLOAT texel as four RGBA floats.
 *
 * Red is read from src; green and blue are 0 and alpha is 1.  The i and j
 * arguments are not used.
 */
void
util_format_r32_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32_float texel;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = texel.r; /* r */
   rgba[1] = 0.0f;    /* g */
   rgba[2] = 0.0f;    /* b */
   rgba[3] = 1.0f;    /* a */
}

void
util_format_r32_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_r32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

void
util_format_r32_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r32_float pixel = {0};
         pixel.r = ubyte_to_float(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Channel layout of one R32G32_FLOAT texel.
 *
 * The member order is the same on big- and little-endian hosts.
 */
struct util_format_r32g32_float {
   float r;
   float g;
};

void
util_format_r32g32_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_float pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_float pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32_FLOAT texel as four RGBA floats.
 *
 * Red and green are read from src; blue is 0 and alpha is 1.  The i and j
 * arguments are not used.
 */
void
util_format_r32g32_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32_float texel;

   /* Same code on big- and little-endian hosts. */
   memcpy(&texel, src, sizeof texel);

   rgba[0] = texel.r; /* r */
   rgba[1] = texel.g; /* g */
   rgba[2] = 0.0f;    /* b */
   rgba[3] = 1.0f;    /* a */
}

void
util_format_r32g32_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = float_to_ubyte(pixel.g); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = float_to_ubyte(pixel.g); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_float pixel = {0};
         pixel.r = ubyte_to_float(src[0]);
         pixel.g = ubyte_to_float(src[1]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_float pixel = {0};
         pixel.r = ubyte_to_float(src[0]);
         pixel.g = ubyte_to_float(src[1]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Channel layout of one R32G32B32_FLOAT texel.
 *
 * The member order is the same on big- and little-endian hosts.
 */
struct util_format_r32g32b32_float {
   float r;
   float g;
   float b;
};

void
util_format_r32g32b32_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_float pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_float pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32_FLOAT texel as four RGBA floats.
 *
 * Red, green and blue are read from src and alpha is 1.  The i and j
 * arguments are not used.
 */
void
util_format_r32g32b32_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32b32_float texel;

   /* Same code on big- and little-endian hosts. */
   memcpy(&texel, src, sizeof texel);

   rgba[0] = texel.r; /* r */
   rgba[1] = texel.g; /* g */
   rgba[2] = texel.b; /* b */
   rgba[3] = 1.0f;    /* a */
}

void
util_format_r32g32b32_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = float_to_ubyte(pixel.g); /* g */
         dst[2] = float_to_ubyte(pixel.b); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32b32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = float_to_ubyte(pixel.g); /* g */
         dst[2] = float_to_ubyte(pixel.b); /* b */
         dst[3] = 255; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_float pixel = {0};
         pixel.r = ubyte_to_float(src[0]);
         pixel.g = ubyte_to_float(src[1]);
         pixel.b = ubyte_to_float(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_float pixel = {0};
         pixel.r = ubyte_to_float(src[0]);
         pixel.g = ubyte_to_float(src[1]);
         pixel.b = ubyte_to_float(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Channel layout of one R32G32B32A32_FLOAT texel.
 *
 * The member order is the same on big- and little-endian hosts.
 */
struct util_format_r32g32b32a32_float {
   float r;
   float g;
   float b;
   float a;
};

void
util_format_r32g32b32a32_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = pixel.a; /* a */
#else
         struct util_format_r32g32b32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = pixel.a; /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_float pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_float pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32A32_FLOAT texel as four RGBA floats.
 *
 * All four channels are read from src.  The i and j arguments are not
 * used.
 */
void
util_format_r32g32b32a32_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32b32a32_float texel;

   /* Same code on big- and little-endian hosts. */
   memcpy(&texel, src, sizeof texel);

   rgba[0] = texel.r; /* r */
   rgba[1] = texel.g; /* g */
   rgba[2] = texel.b; /* b */
   rgba[3] = texel.a; /* a */
}

void
util_format_r32g32b32a32_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = float_to_ubyte(pixel.g); /* g */
         dst[2] = float_to_ubyte(pixel.b); /* b */
         dst[3] = float_to_ubyte(pixel.a); /* a */
#else
         struct util_format_r32g32b32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = float_to_ubyte(pixel.g); /* g */
         dst[2] = float_to_ubyte(pixel.b); /* b */
         dst[3] = float_to_ubyte(pixel.a); /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_float pixel = {0};
         pixel.r = ubyte_to_float(src[0]);
         pixel.g = ubyte_to_float(src[1]);
         pixel.b = ubyte_to_float(src[2]);
         pixel.a = ubyte_to_float(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_float pixel = {0};
         pixel.r = ubyte_to_float(src[0]);
         pixel.g = ubyte_to_float(src[1]);
         pixel.b = ubyte_to_float(src[2]);
         pixel.a = ubyte_to_float(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R32_UNORM texels into RGBA floats.
 *
 * Reads width texels of 4 bytes each from src.  Red is the 32-bit value
 * scaled by 1/0xffffffff; green and blue are 0 and alpha is 1.  Four
 * floats are written to dst_row per texel.
 */
void
util_format_r32_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         uint32_t r;

         /* src is a byte pointer with no alignment guarantee, so the texel
          * is loaded with memcpy instead of through a uint32_t pointer cast
          * (which is undefined for misaligned or differently-typed
          * storage).  The value read is the same native-endian word. */
         memcpy(&r, src, sizeof r);
         dst[0] = (float)(r * (1.0/0xffffffff)); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA floats into R32_UNORM texels.
 *
 * Only the red component of each 4-float source pixel is used: it is
 * clamped to [0, 1], scaled by 0xffffffff and stored as a 4-byte texel.
 * dst_stride and src_stride are byte distances between consecutive rows.
 */
void
util_format_r32_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);

         /* dst is a byte pointer with no alignment guarantee, so the texel
          * is stored with memcpy instead of through a uint32_t pointer
          * cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_row is a float pointer: convert the byte stride to floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32_UNORM texel as four RGBA floats.
 *
 * Red is the 32-bit value at src scaled by 1/0xffffffff; green and blue
 * are 0 and alpha is 1.  The i and j arguments are not used.
 */
void
util_format_r32_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   uint32_t r;

   /* src is a byte pointer with no alignment guarantee, so the texel is
    * loaded with memcpy instead of through a uint32_t pointer cast. */
   memcpy(&r, src, sizeof r);
   dst[0] = (float)(r * (1.0/0xffffffff)); /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a row of R32_UNORM texels into RGBA8 bytes.
 *
 * Red is converted from 32 to 8 bits with _mesa_unorm_to_unorm(); green
 * and blue are 0 and alpha is 255.  Each source texel is 4 bytes and each
 * output pixel is 4 bytes.
 */
void
util_format_r32_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         uint32_t r;

         /* src is a byte pointer with no alignment guarantee, so the texel
          * is loaded with memcpy instead of through a uint32_t pointer
          * cast. */
         memcpy(&r, src, sizeof r);
         dst[0] = _mesa_unorm_to_unorm(r, 32, 8); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA8 bytes into R32_UNORM texels.
 *
 * Only the red byte of each 4-byte source pixel is used: it is converted
 * from 8 to 32 bits with _mesa_unorm_to_unorm() and stored as a 4-byte
 * texel.  dst_stride and src_stride are byte distances between
 * consecutive rows.
 */
void
util_format_r32_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = _mesa_unorm_to_unorm(src[0], 8, 32);

         /* dst is a byte pointer with no alignment guarantee, so the texel
          * is stored with memcpy instead of through a uint32_t pointer
          * cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Channel layout of one R32G32_UNORM texel.
 *
 * The member order is the same on big- and little-endian hosts.
 */
struct util_format_r32g32_unorm {
   uint32_t r;
   uint32_t g;
};

void
util_format_r32g32_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0xffffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0xffffffff)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0xffffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0xffffffff)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_unorm pixel = {0};
         pixel.r = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.g = (uint32_t)(CLAMP(src[1], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_unorm pixel = {0};
         pixel.r = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.g = (uint32_t)(CLAMP(src[1], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32_UNORM texel as four RGBA floats.
 *
 * Red and green are the 32-bit values at src scaled by 1/0xffffffff; blue
 * is 0 and alpha is 1.  The i and j arguments are not used.
 */
void
util_format_r32g32_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32_unorm texel;

   /* Same code on big- and little-endian hosts. */
   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)(texel.r * (1.0/0xffffffff)); /* r */
   rgba[1] = (float)(texel.g * (1.0/0xffffffff)); /* g */
   rgba[2] = 0.0f;                                /* b */
   rgba[3] = 1.0f;                                /* a */
}

void
util_format_r32g32_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_unorm_to_unorm(pixel.r, 32, 8); /* r */
         dst[1] = _mesa_unorm_to_unorm(pixel.g, 32, 8); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_unorm_to_unorm(pixel.r, 32, 8); /* r */
         dst[1] = _mesa_unorm_to_unorm(pixel.g, 32, 8); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_unorm pixel = {0};
         pixel.r = _mesa_unorm_to_unorm(src[0], 8, 32);
         pixel.g = _mesa_unorm_to_unorm(src[1], 8, 32);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_unorm pixel = {0};
         pixel.r = _mesa_unorm_to_unorm(src[0], 8, 32);
         pixel.g = _mesa_unorm_to_unorm(src[1], 8, 32);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Channel layout of one R32G32B32_UNORM texel.
 *
 * The member order is the same on big- and little-endian hosts.
 */
struct util_format_r32g32b32_unorm {
   uint32_t r;
   uint32_t g;
   uint32_t b;
};

void
util_format_r32g32b32_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0xffffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0xffffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0xffffffff)); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0xffffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0xffffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0xffffffff)); /* b */
         dst[3] = 1; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_unorm pixel = {0};
         pixel.r = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.g = (uint32_t)(CLAMP(src[1], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.b = (uint32_t)(CLAMP(src[2], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_unorm pixel = {0};
         pixel.r = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.g = (uint32_t)(CLAMP(src[1], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.b = (uint32_t)(CLAMP(src[2], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32_UNORM texel as four RGBA floats.
 *
 * Red, green and blue are the 32-bit values at src scaled by
 * 1/0xffffffff and alpha is 1.  The i and j arguments are not used.
 */
void
util_format_r32g32b32_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32b32_unorm texel;

   /* Same code on big- and little-endian hosts. */
   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)(texel.r * (1.0/0xffffffff)); /* r */
   rgba[1] = (float)(texel.g * (1.0/0xffffffff)); /* g */
   rgba[2] = (float)(texel.b * (1.0/0xffffffff)); /* b */
   rgba[3] = 1.0f;                                /* a */
}

void
util_format_r32g32b32_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_unorm_to_unorm(pixel.r, 32, 8); /* r */
         dst[1] = _mesa_unorm_to_unorm(pixel.g, 32, 8); /* g */
         dst[2] = _mesa_unorm_to_unorm(pixel.b, 32, 8); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32b32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_unorm_to_unorm(pixel.r, 32, 8); /* r */
         dst[1] = _mesa_unorm_to_unorm(pixel.g, 32, 8); /* g */
         dst[2] = _mesa_unorm_to_unorm(pixel.b, 32, 8); /* b */
         dst[3] = 255; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_unorm pixel = {0};
         pixel.r = _mesa_unorm_to_unorm(src[0], 8, 32);
         pixel.g = _mesa_unorm_to_unorm(src[1], 8, 32);
         pixel.b = _mesa_unorm_to_unorm(src[2], 8, 32);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_unorm pixel = {0};
         pixel.r = _mesa_unorm_to_unorm(src[0], 8, 32);
         pixel.g = _mesa_unorm_to_unorm(src[1], 8, 32);
         pixel.b = _mesa_unorm_to_unorm(src[2], 8, 32);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Channel layout of one R32G32B32A32_UNORM texel.
 *
 * The member order is the same on big- and little-endian hosts.
 */
struct util_format_r32g32b32a32_unorm {
   uint32_t r;
   uint32_t g;
   uint32_t b;
   uint32_t a;
};

void
util_format_r32g32b32a32_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0xffffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0xffffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0xffffffff)); /* b */
         dst[3] = (float)(pixel.a * (1.0/0xffffffff)); /* a */
#else
         struct util_format_r32g32b32a32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0xffffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0xffffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0xffffffff)); /* b */
         dst[3] = (float)(pixel.a * (1.0/0xffffffff)); /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_unorm pixel = {0};
         pixel.r = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.g = (uint32_t)(CLAMP(src[1], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.b = (uint32_t)(CLAMP(src[2], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.a = (uint32_t)(CLAMP(src[3], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_unorm pixel = {0};
         pixel.r = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.g = (uint32_t)(CLAMP(src[1], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.b = (uint32_t)(CLAMP(src[2], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.a = (uint32_t)(CLAMP(src[3], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32A32_UNORM texel as four RGBA floats.
 *
 * Every channel is the 32-bit value at src scaled by 1/0xffffffff.  The
 * i and j arguments are not used.
 */
void
util_format_r32g32b32a32_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32b32a32_unorm texel;

   /* Same code on big- and little-endian hosts. */
   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)(texel.r * (1.0/0xffffffff)); /* r */
   rgba[1] = (float)(texel.g * (1.0/0xffffffff)); /* g */
   rgba[2] = (float)(texel.b * (1.0/0xffffffff)); /* b */
   rgba[3] = (float)(texel.a * (1.0/0xffffffff)); /* a */
}

void
util_format_r32g32b32a32_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_unorm_to_unorm(pixel.r, 32, 8); /* r */
         dst[1] = _mesa_unorm_to_unorm(pixel.g, 32, 8); /* g */
         dst[2] = _mesa_unorm_to_unorm(pixel.b, 32, 8); /* b */
         dst[3] = _mesa_unorm_to_unorm(pixel.a, 32, 8); /* a */
#else
         struct util_format_r32g32b32a32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_unorm_to_unorm(pixel.r, 32, 8); /* r */
         dst[1] = _mesa_unorm_to_unorm(pixel.g, 32, 8); /* g */
         dst[2] = _mesa_unorm_to_unorm(pixel.b, 32, 8); /* b */
         dst[3] = _mesa_unorm_to_unorm(pixel.a, 32, 8); /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_unorm pixel = {0};
         pixel.r = _mesa_unorm_to_unorm(src[0], 8, 32);
         pixel.g = _mesa_unorm_to_unorm(src[1], 8, 32);
         pixel.b = _mesa_unorm_to_unorm(src[2], 8, 32);
         pixel.a = _mesa_unorm_to_unorm(src[3], 8, 32);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_unorm pixel = {0};
         pixel.r = _mesa_unorm_to_unorm(src[0], 8, 32);
         pixel.g = _mesa_unorm_to_unorm(src[1], 8, 32);
         pixel.b = _mesa_unorm_to_unorm(src[2], 8, 32);
         pixel.a = _mesa_unorm_to_unorm(src[3], 8, 32);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R32_USCALED pixels to RGBA floats.
 *
 * @param dst_row  destination, written as 4 floats per pixel
 * @param src      source pixels, 4 bytes each
 * @param width    number of pixels to convert
 *
 * Red is the raw 32-bit unsigned value converted to float; green and blue
 * are set to 0 and alpha to 1.
 */
void
util_format_r32_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t r;

      /* src is a byte pointer, so its type promises no 4-byte alignment;
       * copy the pixel out instead of dereferencing it as uint32_t. */
      memcpy(&r, src, sizeof r);
      dst[0] = (float)r; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R32_USCALED.
 *
 * @param dst_row     destination, 4 bytes written per pixel
 * @param dst_stride  byte distance between destination rows
 * @param src_row     source, 4 floats per pixel (only red is read)
 * @param src_stride  byte distance between source rows
 * @param width       pixels per row
 * @param height      number of rows
 *
 * Red is clamped to [0, 4294967040] (2^32 - 256, which keeps the conversion
 * to uint32_t in range) and truncated to an integer.
 */
void
util_format_r32_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);

         /* dst is a byte pointer with no alignment promise; store through
          * memcpy rather than a uint32_t lvalue. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32_USCALED texel as four floats.
 *
 * The texel is read from src; i and j are unused.  Red is the 32-bit
 * unsigned value converted to float, green and blue are 0, alpha is 1.
 */
void
util_format_r32_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   uint32_t r;

   /* Byte pointer: copy the value out instead of casting to uint32_t *,
    * which would assume alignment the type does not promise. */
   memcpy(&r, src, sizeof r);
   dst[0] = (float)r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a row of R32_USCALED pixels to 8-bit RGBA.
 *
 * @param dst_row  destination, 4 bytes written per pixel
 * @param src      source pixels, 4 bytes each
 * @param width    number of pixels to convert
 *
 * Red is capped at 1 with MIN2 and multiplied by 0xff, so a zero channel
 * gives 0 and any other value gives 255.  Green and blue are 0, alpha 255.
 */
void
util_format_r32_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t r;

      /* Avoid a uint32_t dereference of a byte pointer (alignment/aliasing). */
      memcpy(&r, src, sizeof r);
      dst[0] = (uint8_t)(((uint64_t)MIN2(r, 1)) * 0xff / 0x1); /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack 8-bit RGBA pixels into R32_USCALED.
 *
 * @param dst_row     destination, 4 bytes written per pixel
 * @param dst_stride  byte distance between destination rows
 * @param src_row     source, 4 bytes per pixel (only red is read)
 * @param src_stride  byte distance between source rows
 * @param width       pixels per row
 * @param height      number of rows
 *
 * Red is divided by 0xff using integer arithmetic, so only 255 yields 1 and
 * every smaller value yields 0.
 */
void
util_format_r32_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);

         /* dst is a byte pointer with no alignment promise; store through
          * memcpy rather than a uint32_t lvalue. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R32G32_USCALED pixel: two 32-bit unsigned
 * channels, red first.  The field order does not depend on host endianness. */
struct util_format_r32g32_uscaled {
   uint32_t r;
   uint32_t g;
};

void
util_format_r32g32_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uscaled pixel = {0};
         pixel.r = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         pixel.g = (uint32_t)CLAMP(src[1], 0.0f, 4294967040.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_uscaled pixel = {0};
         pixel.r = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         pixel.g = (uint32_t)CLAMP(src[1], 0.0f, 4294967040.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32_USCALED texel as four floats.
 *
 * The texel is read from src; i and j are unused.  Red and green are the
 * channel values converted to float, blue is 0 and alpha is 1.
 */
void
util_format_r32g32_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32_uscaled texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)texel.r;
   rgba[1] = (float)texel.g;
   rgba[2] = 0;
   rgba[3] = 1;
}

void
util_format_r32g32_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uscaled pixel = {0};
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_uscaled pixel = {0};
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R32G32B32_USCALED pixel: three 32-bit unsigned
 * channels in r, g, b order.  The field order does not depend on host
 * endianness. */
struct util_format_r32g32b32_uscaled {
   uint32_t r;
   uint32_t g;
   uint32_t b;
};

void
util_format_r32g32b32_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uscaled pixel = {0};
         pixel.r = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         pixel.g = (uint32_t)CLAMP(src[1], 0.0f, 4294967040.0f);
         pixel.b = (uint32_t)CLAMP(src[2], 0.0f, 4294967040.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_uscaled pixel = {0};
         pixel.r = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         pixel.g = (uint32_t)CLAMP(src[1], 0.0f, 4294967040.0f);
         pixel.b = (uint32_t)CLAMP(src[2], 0.0f, 4294967040.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32_USCALED texel as four floats.
 *
 * The texel is read from src; i and j are unused.  Red, green and blue are
 * the channel values converted to float and alpha is 1.
 */
void
util_format_r32g32b32_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32b32_uscaled texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)texel.r;
   rgba[1] = (float)texel.g;
   rgba[2] = (float)texel.b;
   rgba[3] = 1;
}

void
util_format_r32g32b32_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32b32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uscaled pixel = {0};
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_uscaled pixel = {0};
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R32G32B32A32_USCALED pixel: four 32-bit unsigned
 * channels in r, g, b, a order.  The field order does not depend on host
 * endianness. */
struct util_format_r32g32b32a32_uscaled {
   uint32_t r;
   uint32_t g;
   uint32_t b;
   uint32_t a;
};

void
util_format_r32g32b32a32_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#else
         struct util_format_r32g32b32a32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uscaled pixel = {0};
         pixel.r = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         pixel.g = (uint32_t)CLAMP(src[1], 0.0f, 4294967040.0f);
         pixel.b = (uint32_t)CLAMP(src[2], 0.0f, 4294967040.0f);
         pixel.a = (uint32_t)CLAMP(src[3], 0.0f, 4294967040.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_uscaled pixel = {0};
         pixel.r = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         pixel.g = (uint32_t)CLAMP(src[1], 0.0f, 4294967040.0f);
         pixel.b = (uint32_t)CLAMP(src[2], 0.0f, 4294967040.0f);
         pixel.a = (uint32_t)CLAMP(src[3], 0.0f, 4294967040.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32A32_USCALED texel as four floats.
 *
 * The texel is read from src; i and j are unused.  Every channel is the
 * unsigned value converted to float.
 */
void
util_format_r32g32b32a32_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32b32a32_uscaled texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)texel.r;
   rgba[1] = (float)texel.g;
   rgba[2] = (float)texel.b;
   rgba[3] = (float)texel.a;
}

void
util_format_r32g32b32a32_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint64_t)MIN2(pixel.a, 1)) * 0xff / 0x1); /* a */
#else
         struct util_format_r32g32b32a32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint64_t)MIN2(pixel.a, 1)) * 0xff / 0x1); /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uscaled pixel = {0};
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         pixel.a = (uint32_t)(((uint64_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_uscaled pixel = {0};
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         pixel.a = (uint32_t)(((uint64_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R32_SNORM pixels to RGBA floats.
 *
 * @param dst_row  destination, written as 4 floats per pixel
 * @param src      source pixels, 4 bytes each
 * @param width    number of pixels to convert
 *
 * Red is the signed 32-bit value multiplied by 1/0x7fffffff in double
 * precision and narrowed to float; green and blue are 0 and alpha is 1.
 */
void
util_format_r32_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      int32_t r;

      /* src is a byte pointer, so its type promises no 4-byte alignment;
       * copy the pixel out instead of dereferencing it as a 32-bit word. */
      memcpy(&r, src, sizeof r);
      dst[0] = (float)(r * (1.0/0x7fffffff)); /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R32_SNORM.
 *
 * @param dst_row     destination, 4 bytes written per pixel
 * @param dst_stride  byte distance between destination rows
 * @param src_row     source, 4 floats per pixel (only red is read)
 * @param src_stride  byte distance between source rows
 * @param width       pixels per row
 * @param height      number of rows
 *
 * Red is clamped to [-1, 1], scaled by 0x7fffffff in double precision and
 * truncated to a signed 32-bit integer.
 */
void
util_format_r32_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int32_t r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);

         /* dst is a byte pointer with no alignment promise; store through
          * memcpy rather than a 32-bit lvalue. */
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32_SNORM texel as four floats.
 *
 * The texel is read from src; i and j are unused.  Red is the signed 32-bit
 * value multiplied by 1/0x7fffffff, green and blue are 0, alpha is 1.
 */
void
util_format_r32_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   int32_t r;

   /* Byte pointer: copy the value out instead of casting to a 32-bit
    * pointer, which would assume alignment the type does not promise. */
   memcpy(&r, src, sizeof r);
   dst[0] = (float)(r * (1.0/0x7fffffff)); /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a row of R32_SNORM pixels to 8-bit RGBA.
 *
 * @param dst_row  destination, 4 bytes written per pixel
 * @param src      source pixels, 4 bytes each
 * @param width    number of pixels to convert
 *
 * Negative red values are raised to 0 with MAX2 before
 * _mesa_snorm_to_unorm() reduces the channel from 32 to 8 bits.  Green and
 * blue are 0 and alpha is 255.
 */
void
util_format_r32_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      int32_t r;

      /* Avoid a 32-bit dereference of a byte pointer (alignment/aliasing). */
      memcpy(&r, src, sizeof r);
      dst[0] = _mesa_snorm_to_unorm(MAX2(r, 0), 32, 8); /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack 8-bit RGBA pixels into R32_SNORM.
 *
 * @param dst_row     destination, 4 bytes written per pixel
 * @param dst_stride  byte distance between destination rows
 * @param src_row     source, 4 bytes per pixel (only red is read)
 * @param src_stride  byte distance between source rows
 * @param width       pixels per row
 * @param height      number of rows
 *
 * Red is converted from 8-bit unorm to 32-bit snorm with
 * _mesa_unorm_to_snorm().
 */
void
util_format_r32_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)(_mesa_unorm_to_snorm(src[0], 8, 32));

         /* dst is a byte pointer with no alignment promise; store through
          * memcpy rather than a uint32_t lvalue. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R32G32_SNORM pixel: two signed 32-bit channels,
 * red first.  The field order does not depend on host endianness. */
struct util_format_r32g32_snorm {
   int32_t r;
   int32_t g;
};

void
util_format_r32g32_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x7fffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x7fffffff)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x7fffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x7fffffff)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_snorm pixel = {0};
         pixel.r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.g = (int32_t)(CLAMP(src[1], -1.0f, 1.0f) * (double)0x7fffffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_snorm pixel = {0};
         pixel.r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.g = (int32_t)(CLAMP(src[1], -1.0f, 1.0f) * (double)0x7fffffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32_SNORM texel as four floats.
 *
 * The texel is read from src; i and j are unused.  Red and green are
 * multiplied by 1/0x7fffffff, blue is 0 and alpha is 1.
 */
void
util_format_r32g32_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32_snorm texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)(texel.r * (1.0/0x7fffffff));
   rgba[1] = (float)(texel.g * (1.0/0x7fffffff));
   rgba[2] = 0;
   rgba[3] = 1;
}

void
util_format_r32g32_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 32, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 32, 8); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 32, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 32, 8); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_snorm pixel = {0};
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 32);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 32);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_snorm pixel = {0};
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 32);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 32);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R32G32B32_SNORM pixel: three signed 32-bit
 * channels in r, g, b order.  The field order does not depend on host
 * endianness. */
struct util_format_r32g32b32_snorm {
   int32_t r;
   int32_t g;
   int32_t b;
};

void
util_format_r32g32b32_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x7fffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x7fffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x7fffffff)); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x7fffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x7fffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x7fffffff)); /* b */
         dst[3] = 1; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_snorm pixel = {0};
         pixel.r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.g = (int32_t)(CLAMP(src[1], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.b = (int32_t)(CLAMP(src[2], -1.0f, 1.0f) * (double)0x7fffffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_snorm pixel = {0};
         pixel.r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.g = (int32_t)(CLAMP(src[1], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.b = (int32_t)(CLAMP(src[2], -1.0f, 1.0f) * (double)0x7fffffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32_SNORM texel as four floats.
 *
 * The texel is read from src; i and j are unused.  Red, green and blue are
 * multiplied by 1/0x7fffffff and alpha is 1.
 */
void
util_format_r32g32b32_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32b32_snorm texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)(texel.r * (1.0/0x7fffffff));
   rgba[1] = (float)(texel.g * (1.0/0x7fffffff));
   rgba[2] = (float)(texel.b * (1.0/0x7fffffff));
   rgba[3] = 1;
}

void
util_format_r32g32b32_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 32, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 32, 8); /* g */
         dst[2] = _mesa_snorm_to_unorm(MAX2(pixel.b, 0), 32, 8); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32b32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 32, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 32, 8); /* g */
         dst[2] = _mesa_snorm_to_unorm(MAX2(pixel.b, 0), 32, 8); /* b */
         dst[3] = 255; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_snorm pixel = {0};
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 32);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 32);
         pixel.b = _mesa_unorm_to_snorm(src[2], 8, 32);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_snorm pixel = {0};
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 32);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 32);
         pixel.b = _mesa_unorm_to_snorm(src[2], 8, 32);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R32G32B32A32_SNORM pixel: four signed 32-bit
 * channels in r, g, b, a order.  The field order does not depend on host
 * endianness. */
struct util_format_r32g32b32a32_snorm {
   int32_t r;
   int32_t g;
   int32_t b;
   int32_t a;
};

void
util_format_r32g32b32a32_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x7fffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x7fffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x7fffffff)); /* b */
         dst[3] = (float)(pixel.a * (1.0/0x7fffffff)); /* a */
#else
         struct util_format_r32g32b32a32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x7fffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x7fffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x7fffffff)); /* b */
         dst[3] = (float)(pixel.a * (1.0/0x7fffffff)); /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_snorm pixel = {0};
         pixel.r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.g = (int32_t)(CLAMP(src[1], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.b = (int32_t)(CLAMP(src[2], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.a = (int32_t)(CLAMP(src[3], -1.0f, 1.0f) * (double)0x7fffffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_snorm pixel = {0};
         pixel.r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.g = (int32_t)(CLAMP(src[1], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.b = (int32_t)(CLAMP(src[2], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.a = (int32_t)(CLAMP(src[3], -1.0f, 1.0f) * (double)0x7fffffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single 16-byte r32g32b32a32_snorm texel at `src` as four floats.
 *
 * The i/j coordinates are unused.  Each channel is scaled by 1/0x7fffffff.
 */
void
util_format_r32g32b32a32_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32b32a32_snorm texel;

   /* The same code serves big- and little-endian hosts. */
   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)(texel.r * (1.0/0x7fffffff)); /* r */
   rgba[1] = (float)(texel.g * (1.0/0x7fffffff)); /* g */
   rgba[2] = (float)(texel.b * (1.0/0x7fffffff)); /* b */
   rgba[3] = (float)(texel.a * (1.0/0x7fffffff)); /* a */
}

void
util_format_r32g32b32a32_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 32, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 32, 8); /* g */
         dst[2] = _mesa_snorm_to_unorm(MAX2(pixel.b, 0), 32, 8); /* b */
         dst[3] = _mesa_snorm_to_unorm(MAX2(pixel.a, 0), 32, 8); /* a */
#else
         struct util_format_r32g32b32a32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 32, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 32, 8); /* g */
         dst[2] = _mesa_snorm_to_unorm(MAX2(pixel.b, 0), 32, 8); /* b */
         dst[3] = _mesa_snorm_to_unorm(MAX2(pixel.a, 0), 32, 8); /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_snorm pixel = {0};
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 32);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 32);
         pixel.b = _mesa_unorm_to_snorm(src[2], 8, 32);
         pixel.a = _mesa_unorm_to_snorm(src[3], 8, 32);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_snorm pixel = {0};
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 32);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 32);
         pixel.b = _mesa_unorm_to_snorm(src[2], 8, 32);
         pixel.a = _mesa_unorm_to_snorm(src[3], 8, 32);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack one row of `width` r32_sscaled texels (4 bytes each) into RGBA
 * floats.
 *
 * The 32-bit value is reinterpreted as a signed integer and converted to
 * float for the red channel; green and blue are written as 0 and alpha as 1.
 * Four floats are written per texel.
 */
void
util_format_r32_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         uint32_t value;
         /* src is a byte pointer with no alignment guarantee, so load the
          * word with memcpy instead of dereferencing a casted pointer. */
         memcpy(&value, src, sizeof value);
         int32_t r = (int32_t)(value) ;
         dst[0] = (float)r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a rectangle of RGBA floats into 4-byte r32_sscaled texels.
 *
 * Only the first float of each 4-float source pixel is used.  It is passed
 * through CLAMP(..., -2147483648.0f, 2147483520.0f) and truncated to int32_t.
 * dst_stride is applied in bytes; src_stride is given in bytes and converted
 * to floats here.
 */
void
util_format_r32_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
         value |= (uint32_t)((int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f)) ;
         /* dst is a byte pointer with no alignment guarantee, so store the
          * word with memcpy instead of writing through a casted pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single 4-byte r32_sscaled texel at `src` as four floats.
 *
 * The i/j coordinates are unused.  Red is the signed 32-bit value converted
 * to float; green and blue are written as 0 and alpha as 1.
 */
void
util_format_r32_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
         uint32_t value;
         /* src is a byte pointer with no alignment guarantee, so load the
          * word with memcpy instead of dereferencing a casted pointer. */
         memcpy(&value, src, sizeof value);
         int32_t r = (int32_t)(value) ;
         dst[0] = (float)r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
}

/**
 * Unpack one row of `width` r32_sscaled texels (4 bytes each) into 8-bit
 * RGBA.
 *
 * The signed value is passed through CLAMP(r, 0, 1) and multiplied by 0xff
 * for the red byte.  Green and blue are written as 0 and alpha as 255.
 */
void
util_format_r32_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         uint32_t value;
         /* src is a byte pointer with no alignment guarantee, so load the
          * word with memcpy instead of dereferencing a casted pointer. */
         memcpy(&value, src, sizeof value);
         int32_t r = (int32_t)(value) ;
         dst[0] = (uint8_t)(((uint64_t)CLAMP(r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into 4-byte r32_sscaled texels.
 *
 * Only the first byte of each 4-byte source pixel is used.  The integer
 * division by 0xff yields 1 when that byte is 255 and 0 otherwise.
 * Both strides are applied to the row pointers once per row.
 */
void
util_format_r32_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
         value |= (uint32_t)((int32_t)(((uint64_t)src[0]) * 0x1 / 0xff)) ;
         /* dst is a byte pointer with no alignment guarantee, so store the
          * word with memcpy instead of writing through a casted pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one r32g32_sscaled texel: two signed 32-bit channels in
 * r, g order.  The member order is the same for either host endianness.
 */
struct util_format_r32g32_sscaled {
   int32_t r;
   int32_t g;
};

void
util_format_r32g32_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sscaled pixel = {0};
         pixel.r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         pixel.g = (int32_t)CLAMP(src[1], -2147483648.0f, 2147483520.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_sscaled pixel = {0};
         pixel.r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         pixel.g = (int32_t)CLAMP(src[1], -2147483648.0f, 2147483520.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single 8-byte r32g32_sscaled texel at `src` as four floats.
 *
 * The i/j coordinates are unused.  Blue is written as 0 and alpha as 1.
 */
void
util_format_r32g32_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32_sscaled texel;

   /* The same code serves big- and little-endian hosts. */
   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)texel.r; /* r */
   rgba[1] = (float)texel.g; /* g */
   rgba[2] = 0; /* b */
   rgba[3] = 1; /* a */
}

void
util_format_r32g32_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sscaled pixel = {0};
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_sscaled pixel = {0};
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one r32g32b32_sscaled texel: three signed 32-bit channels
 * in r, g, b order.  The member order is the same for either host endianness.
 */
struct util_format_r32g32b32_sscaled {
   int32_t r;
   int32_t g;
   int32_t b;
};

void
util_format_r32g32b32_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sscaled pixel = {0};
         pixel.r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         pixel.g = (int32_t)CLAMP(src[1], -2147483648.0f, 2147483520.0f);
         pixel.b = (int32_t)CLAMP(src[2], -2147483648.0f, 2147483520.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_sscaled pixel = {0};
         pixel.r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         pixel.g = (int32_t)CLAMP(src[1], -2147483648.0f, 2147483520.0f);
         pixel.b = (int32_t)CLAMP(src[2], -2147483648.0f, 2147483520.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single 12-byte r32g32b32_sscaled texel at `src` as four floats.
 *
 * The i/j coordinates are unused.  Alpha is written as 1.
 */
void
util_format_r32g32b32_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32b32_sscaled texel;

   /* The same code serves big- and little-endian hosts. */
   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)texel.r; /* r */
   rgba[1] = (float)texel.g; /* g */
   rgba[2] = (float)texel.b; /* b */
   rgba[3] = 1; /* a */
}

void
util_format_r32g32b32_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32b32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sscaled pixel = {0};
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_sscaled pixel = {0};
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one r32g32b32a32_sscaled texel: four signed 32-bit
 * channels in r, g, b, a order.  The member order is the same for either
 * host endianness.
 */
struct util_format_r32g32b32a32_sscaled {
   int32_t r;
   int32_t g;
   int32_t b;
   int32_t a;
};

void
util_format_r32g32b32a32_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#else
         struct util_format_r32g32b32a32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sscaled pixel = {0};
         pixel.r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         pixel.g = (int32_t)CLAMP(src[1], -2147483648.0f, 2147483520.0f);
         pixel.b = (int32_t)CLAMP(src[2], -2147483648.0f, 2147483520.0f);
         pixel.a = (int32_t)CLAMP(src[3], -2147483648.0f, 2147483520.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_sscaled pixel = {0};
         pixel.r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         pixel.g = (int32_t)CLAMP(src[1], -2147483648.0f, 2147483520.0f);
         pixel.b = (int32_t)CLAMP(src[2], -2147483648.0f, 2147483520.0f);
         pixel.a = (int32_t)CLAMP(src[3], -2147483648.0f, 2147483520.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single 16-byte r32g32b32a32_sscaled texel at `src` as four floats.
 *
 * The i/j coordinates are unused.  Each channel is converted directly from
 * int32_t to float.
 */
void
util_format_r32g32b32a32_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32b32a32_sscaled texel;

   /* The same code serves big- and little-endian hosts. */
   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)texel.r; /* r */
   rgba[1] = (float)texel.g; /* g */
   rgba[2] = (float)texel.b; /* b */
   rgba[3] = (float)texel.a; /* a */
}

void
util_format_r32g32b32a32_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint64_t)CLAMP(pixel.a, 0, 1)) * 0xff / 0x1); /* a */
#else
         struct util_format_r32g32b32a32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint64_t)CLAMP(pixel.a, 0, 1)) * 0xff / 0x1); /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sscaled pixel = {0};
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         pixel.a = (int32_t)(((uint64_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_sscaled pixel = {0};
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         pixel.a = (int32_t)(((uint64_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one r16_float texel: a single 16-bit channel held as raw
 * bits in a uint16_t.
 */
struct util_format_r16_float {
   uint16_t r; /* raw 16-bit pattern, decoded with _mesa_half_to_float() */
};

void
util_format_r16_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_r16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_half_to_float(pixel.r); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

void
util_format_r16_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single 2-byte r16_float texel at `src` as four floats.
 *
 * The i/j coordinates are unused.  Green and blue are written as 0 and alpha
 * as 1.
 */
void
util_format_r16_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r16_float texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = _mesa_half_to_float(texel.r); /* r */
   rgba[1] = 0; /* g */
   rgba[2] = 0; /* b */
   rgba[3] = 1; /* a */
}

void
util_format_r16_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_r16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(_mesa_half_to_float(pixel.r)); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
      src += 2;
      dst += 4;
   }
}

void
util_format_r16_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz((float)(src[0] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one r16g16_float texel: two 16-bit channels held as raw
 * bits, in r, g order.  The member order is the same for either host
 * endianness.
 */
struct util_format_r16g16_float {
   uint16_t r;
   uint16_t g;
};

void
util_format_r16g16_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_half_to_float(pixel.r); /* r */
         dst[1] = _mesa_half_to_float(pixel.g); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_half_to_float(pixel.r); /* r */
         dst[1] = _mesa_half_to_float(pixel.g); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
      src += 4;
      dst += 4;
   }
}

void
util_format_r16g16_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz(src[0]);
         pixel.g = _mesa_float_to_float16_rtz(src[1]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz(src[0]);
         pixel.g = _mesa_float_to_float16_rtz(src[1]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single 4-byte r16g16_float texel at `src` as four floats.
 *
 * The i/j coordinates are unused.  Blue is written as 0 and alpha as 1.
 */
void
util_format_r16g16_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r16g16_float texel;

   /* The same code serves big- and little-endian hosts. */
   memcpy(&texel, src, sizeof texel);
   rgba[0] = _mesa_half_to_float(texel.r); /* r */
   rgba[1] = _mesa_half_to_float(texel.g); /* g */
   rgba[2] = 0; /* b */
   rgba[3] = 1; /* a */
}

void
util_format_r16g16_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(_mesa_half_to_float(pixel.r)); /* r */
         dst[1] = float_to_ubyte(_mesa_half_to_float(pixel.g)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r16g16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(_mesa_half_to_float(pixel.r)); /* r */
         dst[1] = float_to_ubyte(_mesa_half_to_float(pixel.g)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
      src += 4;
      dst += 4;
   }
}

void
util_format_r16g16_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz((float)(src[0] * (1.0f/0xff)));
         pixel.g = _mesa_float_to_float16_rtz((float)(src[1] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz((float)(src[0] * (1.0f/0xff)));
         pixel.g = _mesa_float_to_float16_rtz((float)(src[1] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one r16g16b16_float texel: three 16-bit channels held as
 * raw bits, in r, g, b order.  The member order is the same for either host
 * endianness.
 */
struct util_format_r16g16b16_float {
   uint16_t r;
   uint16_t g;
   uint16_t b;
};

void
util_format_r16g16b16_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_half_to_float(pixel.r); /* r */
         dst[1] = _mesa_half_to_float(pixel.g); /* g */
         dst[2] = _mesa_half_to_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_half_to_float(pixel.r); /* r */
         dst[1] = _mesa_half_to_float(pixel.g); /* g */
         dst[2] = _mesa_half_to_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#endif
      src += 6;
      dst += 4;
   }
}

void
util_format_r16g16b16_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz(src[0]);
         pixel.g = _mesa_float_to_float16_rtz(src[1]);
         pixel.b = _mesa_float_to_float16_rtz(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz(src[0]);
         pixel.g = _mesa_float_to_float16_rtz(src[1]);
         pixel.b = _mesa_float_to_float16_rtz(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single 6-byte r16g16b16_float texel at `src` as four floats.
 *
 * The i/j coordinates are unused.  Alpha is written as 1.
 */
void
util_format_r16g16b16_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r16g16b16_float texel;

   /* The same code serves big- and little-endian hosts. */
   memcpy(&texel, src, sizeof texel);
   rgba[0] = _mesa_half_to_float(texel.r); /* r */
   rgba[1] = _mesa_half_to_float(texel.g); /* g */
   rgba[2] = _mesa_half_to_float(texel.b); /* b */
   rgba[3] = 1; /* a */
}

void
util_format_r16g16b16_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(_mesa_half_to_float(pixel.r)); /* r */
         dst[1] = float_to_ubyte(_mesa_half_to_float(pixel.g)); /* g */
         dst[2] = float_to_ubyte(_mesa_half_to_float(pixel.b)); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r16g16b16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(_mesa_half_to_float(pixel.r)); /* r */
         dst[1] = float_to_ubyte(_mesa_half_to_float(pixel.g)); /* g */
         dst[2] = float_to_ubyte(_mesa_half_to_float(pixel.b)); /* b */
         dst[3] = 255; /* a */
#endif
      src += 6;
      dst += 4;
   }
}

void
util_format_r16g16b16_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz((float)(src[0] * (1.0f/0xff)));
         pixel.g = _mesa_float_to_float16_rtz((float)(src[1] * (1.0f/0xff)));
         pixel.b = _mesa_float_to_float16_rtz((float)(src[2] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz((float)(src[0] * (1.0f/0xff)));
         pixel.g = _mesa_float_to_float16_rtz((float)(src[1] * (1.0f/0xff)));
         pixel.b = _mesa_float_to_float16_rtz((float)(src[2] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R16G16B16A16_FLOAT texel: four 16-bit channels
 * in r, g, b, a order.  The layout is the same on big- and little-endian
 * builds.
 */
struct util_format_r16g16b16a16_float {
   uint16_t r;
   uint16_t g;
   uint16_t b;
   uint16_t a;
};

void
util_format_r16g16b16a16_float_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_half_to_float(pixel.r); /* r */
         dst[1] = _mesa_half_to_float(pixel.g); /* g */
         dst[2] = _mesa_half_to_float(pixel.b); /* b */
         dst[3] = _mesa_half_to_float(pixel.a); /* a */
#else
         struct util_format_r16g16b16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_half_to_float(pixel.r); /* r */
         dst[1] = _mesa_half_to_float(pixel.g); /* g */
         dst[2] = _mesa_half_to_float(pixel.b); /* b */
         dst[3] = _mesa_half_to_float(pixel.a); /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r16g16b16a16_float_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz(src[0]);
         pixel.g = _mesa_float_to_float16_rtz(src[1]);
         pixel.b = _mesa_float_to_float16_rtz(src[2]);
         pixel.a = _mesa_float_to_float16_rtz(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz(src[0]);
         pixel.g = _mesa_float_to_float16_rtz(src[1]);
         pixel.b = _mesa_float_to_float16_rtz(src[2]);
         pixel.a = _mesa_float_to_float16_rtz(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16A16_FLOAT texel as RGBA float.
 *
 * Copies one 8-byte texel from \p src and expands all four channels with
 * _mesa_half_to_float().  The coordinates \p i and \p j are not used.
 *
 * \param in_dst  output, receives 4 floats
 * \param src     packed texel
 */
void
util_format_r16g16b16a16_float_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r16g16b16a16_float texel;

   /* The decoding is the same for big- and little-endian builds. */
   memcpy(&texel, src, sizeof texel);
   rgba[0] = _mesa_half_to_float(texel.r); /* r */
   rgba[1] = _mesa_half_to_float(texel.g); /* g */
   rgba[2] = _mesa_half_to_float(texel.b); /* b */
   rgba[3] = _mesa_half_to_float(texel.a); /* a */
}

void
util_format_r16g16b16a16_float_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(_mesa_half_to_float(pixel.r)); /* r */
         dst[1] = float_to_ubyte(_mesa_half_to_float(pixel.g)); /* g */
         dst[2] = float_to_ubyte(_mesa_half_to_float(pixel.b)); /* b */
         dst[3] = float_to_ubyte(_mesa_half_to_float(pixel.a)); /* a */
#else
         struct util_format_r16g16b16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(_mesa_half_to_float(pixel.r)); /* r */
         dst[1] = float_to_ubyte(_mesa_half_to_float(pixel.g)); /* g */
         dst[2] = float_to_ubyte(_mesa_half_to_float(pixel.b)); /* b */
         dst[3] = float_to_ubyte(_mesa_half_to_float(pixel.a)); /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r16g16b16a16_float_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz((float)(src[0] * (1.0f/0xff)));
         pixel.g = _mesa_float_to_float16_rtz((float)(src[1] * (1.0f/0xff)));
         pixel.b = _mesa_float_to_float16_rtz((float)(src[2] * (1.0f/0xff)));
         pixel.a = _mesa_float_to_float16_rtz((float)(src[3] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_float pixel = {0};
         pixel.r = _mesa_float_to_float16_rtz((float)(src[0] * (1.0f/0xff)));
         pixel.g = _mesa_float_to_float16_rtz((float)(src[1] * (1.0f/0xff)));
         pixel.b = _mesa_float_to_float16_rtz((float)(src[2] * (1.0f/0xff)));
         pixel.a = _mesa_float_to_float16_rtz((float)(src[3] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R16_UNORM texels to RGBA float.
 *
 * Red is the 16-bit texel scaled by 1/65535; green and blue are written
 * as 0 and alpha as 1.
 *
 * \param dst_row  output, 4 floats per pixel
 * \param src      packed input texels, 2 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is a byte pointer, so nothing guarantees 16-bit alignment:
       * load with memcpy instead of dereferencing a cast pointer. */
      uint16_t r;
      memcpy(&r, src, sizeof r);

      dst[0] = (float)(r * (1.0f/0xffff)); /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R16_UNORM texels.
 *
 * Only the red float of each 4-float source pixel is used: it is clamped
 * to [0, 1], scaled by 65535 and rounded with util_iround().
 *
 * \param dst_row     first destination row
 * \param dst_stride  distance between destination rows, in bytes
 * \param src_row     first source row
 * \param src_stride  distance between source rows, in bytes
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)util_iround(CLAMP(src[0], 0.0f, 1.0f) * 0xffff);
         /* dst is a byte pointer advanced by an arbitrary byte stride, so
          * store with memcpy rather than through a uint16_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes while src_row counts floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16_UNORM texel as RGBA float.
 *
 * Red is the 16-bit texel scaled by 1/65535; green and blue are 0 and
 * alpha is 1.  The coordinates \p i and \p j are not used.
 *
 * \param in_dst  output, receives 4 floats
 * \param src     packed texel, 2 bytes
 */
void
util_format_r16_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   /* src is a byte pointer, so nothing guarantees 16-bit alignment:
    * load with memcpy instead of dereferencing a cast pointer. */
   uint16_t r;
   memcpy(&r, src, sizeof r);

   dst[0] = (float)(r * (1.0f/0xffff)); /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a run of R16_UNORM texels to RGBA8 unorm.
 *
 * Red is converted with _mesa_unorm_to_unorm(r, 16, 8); green and blue
 * are written as 0 and alpha as 255.
 *
 * \param dst_row  output, 4 bytes per pixel
 * \param src      packed input texels, 2 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is a byte pointer, so nothing guarantees 16-bit alignment:
       * load with memcpy instead of dereferencing a cast pointer. */
      uint16_t r;
      memcpy(&r, src, sizeof r);

      dst[0] = _mesa_unorm_to_unorm(r, 16, 8); /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack RGBA8 unorm pixels into R16_UNORM texels.
 *
 * Only the red byte of each 4-byte source pixel is used; it is widened
 * with _mesa_unorm_to_unorm(r, 8, 16).
 *
 * \param dst_row     first destination row
 * \param dst_stride  distance between destination rows, in bytes
 * \param src_row     first source row
 * \param src_stride  distance between source rows, in bytes
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)_mesa_unorm_to_unorm(src[0], 8, 16);
         /* dst is a byte pointer advanced by an arbitrary byte stride, so
          * store with memcpy rather than through a uint16_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R16G16_UNORM texels to RGBA float.
 *
 * Each texel is read as one 32-bit word; red and green are its two 16-bit
 * halves (which half is which depends on UTIL_ARCH_BIG_ENDIAN), each
 * scaled by 1/65535.  Blue is written as 0 and alpha as 1.
 *
 * \param dst_row  output, 4 floats per pixel
 * \param src      packed input texels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16g16_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is a byte pointer, so nothing guarantees 32-bit alignment:
       * load with memcpy instead of dereferencing a cast pointer. */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      const uint32_t r = value >> 16;
      const uint32_t g = (value) & 0xffff;
#else
      const uint32_t r = (value) & 0xffff;
      const uint32_t g = value >> 16;
#endif
      dst[0] = (float)(r * (1.0f/0xffff)); /* r */
      dst[1] = (float)(g * (1.0f/0xffff)); /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R16G16_UNORM texels.
 *
 * Red and green of each 4-float source pixel are clamped to [0, 1],
 * scaled by 65535 and rounded with util_iround(), then combined into one
 * 32-bit word whose half assignment depends on UTIL_ARCH_BIG_ENDIAN.
 * Blue and alpha are never read.
 *
 * \param dst_row     first destination row
 * \param dst_stride  distance between destination rows, in bytes
 * \param src_row     first source row
 * \param src_stride  distance between source rows, in bytes
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16g16_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t r = (uint16_t)util_iround(CLAMP(src[0], 0.0f, 1.0f) * 0xffff);
         const uint16_t g = (uint16_t)util_iround(CLAMP(src[1], 0.0f, 1.0f) * 0xffff);
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = ((uint32_t)r << 16) | g;
#else
         const uint32_t value = r | ((uint32_t)g << 16);
#endif
         /* dst is a byte pointer advanced by an arbitrary byte stride, so
          * store with memcpy rather than through a uint32_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes while src_row counts floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16_UNORM texel as RGBA float.
 *
 * The texel is read as one 32-bit word; red and green are its two 16-bit
 * halves (selected by UTIL_ARCH_BIG_ENDIAN), each scaled by 1/65535.
 * Blue is 0 and alpha is 1.  The coordinates \p i and \p j are not used.
 *
 * \param in_dst  output, receives 4 floats
 * \param src     packed texel, 4 bytes
 */
void
util_format_r16g16_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   /* src is a byte pointer, so nothing guarantees 32-bit alignment:
    * load with memcpy instead of dereferencing a cast pointer. */
   uint32_t value;
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   const uint32_t r = value >> 16;
   const uint32_t g = (value) & 0xffff;
#else
   const uint32_t r = (value) & 0xffff;
   const uint32_t g = value >> 16;
#endif
   dst[0] = (float)(r * (1.0f/0xffff)); /* r */
   dst[1] = (float)(g * (1.0f/0xffff)); /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a run of R16G16_UNORM texels to RGBA8 unorm.
 *
 * Each texel is read as one 32-bit word; red and green are its two 16-bit
 * halves (selected by UTIL_ARCH_BIG_ENDIAN), each narrowed with
 * _mesa_unorm_to_unorm(x, 16, 8).  Blue is written as 0 and alpha as 255.
 *
 * \param dst_row  output, 4 bytes per pixel
 * \param src      packed input texels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16g16_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is a byte pointer, so nothing guarantees 32-bit alignment:
       * load with memcpy instead of dereferencing a cast pointer. */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      const uint32_t r = value >> 16;
      const uint32_t g = (value) & 0xffff;
#else
      const uint32_t r = (value) & 0xffff;
      const uint32_t g = value >> 16;
#endif
      dst[0] = _mesa_unorm_to_unorm(r, 16, 8); /* r */
      dst[1] = _mesa_unorm_to_unorm(g, 16, 8); /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA8 unorm pixels into R16G16_UNORM texels.
 *
 * Red and green bytes of each 4-byte source pixel are widened with
 * _mesa_unorm_to_unorm(x, 8, 16) and combined into one 32-bit word whose
 * half assignment depends on UTIL_ARCH_BIG_ENDIAN.  Blue and alpha are
 * never read.
 *
 * \param dst_row     first destination row
 * \param dst_stride  distance between destination rows, in bytes
 * \param src_row     first source row
 * \param src_stride  distance between source rows, in bytes
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16g16_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)(_mesa_unorm_to_unorm(src[0], 8, 16)) << 16;
         value |= (_mesa_unorm_to_unorm(src[1], 8, 16)) & 0xffff;
#else
         value |= (_mesa_unorm_to_unorm(src[0], 8, 16)) & 0xffff;
         value |= (uint32_t)(_mesa_unorm_to_unorm(src[1], 8, 16)) << 16;
#endif
         /* dst is a byte pointer advanced by an arbitrary byte stride, so
          * store with memcpy rather than through a uint32_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R16G16B16_UNORM texel: three 16-bit channels in
 * r, g, b order.  The layout is the same on big- and little-endian builds.
 */
struct util_format_r16g16b16_unorm {
   uint16_t r;
   uint16_t g;
   uint16_t b;
};

void
util_format_r16g16b16_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0f/0xffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0f/0xffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0f/0xffff)); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0f/0xffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0f/0xffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0f/0xffff)); /* b */
         dst[3] = 1; /* a */
#endif
      src += 6;
      dst += 4;
   }
}

void
util_format_r16g16b16_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_unorm pixel = {0};
         pixel.r = (uint16_t)util_iround(CLAMP(src[0], 0.0f, 1.0f) * 0xffff);
         pixel.g = (uint16_t)util_iround(CLAMP(src[1], 0.0f, 1.0f) * 0xffff);
         pixel.b = (uint16_t)util_iround(CLAMP(src[2], 0.0f, 1.0f) * 0xffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_unorm pixel = {0};
         pixel.r = (uint16_t)util_iround(CLAMP(src[0], 0.0f, 1.0f) * 0xffff);
         pixel.g = (uint16_t)util_iround(CLAMP(src[1], 0.0f, 1.0f) * 0xffff);
         pixel.b = (uint16_t)util_iround(CLAMP(src[2], 0.0f, 1.0f) * 0xffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16_UNORM texel as RGBA float.
 *
 * Copies one 6-byte texel from \p src, scales its three channels by
 * 1/65535 and writes alpha as 1.  The coordinates \p i and \p j are not
 * used.
 *
 * \param in_dst  output, receives 4 floats
 * \param src     packed texel
 */
void
util_format_r16g16b16_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r16g16b16_unorm texel;

   /* The decoding is the same for big- and little-endian builds. */
   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)(texel.r * (1.0f/0xffff)); /* r */
   rgba[1] = (float)(texel.g * (1.0f/0xffff)); /* g */
   rgba[2] = (float)(texel.b * (1.0f/0xffff)); /* b */
   rgba[3] = 1; /* a */
}

void
util_format_r16g16b16_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_unorm_to_unorm(pixel.r, 16, 8); /* r */
         dst[1] = _mesa_unorm_to_unorm(pixel.g, 16, 8); /* g */
         dst[2] = _mesa_unorm_to_unorm(pixel.b, 16, 8); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r16g16b16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_unorm_to_unorm(pixel.r, 16, 8); /* r */
         dst[1] = _mesa_unorm_to_unorm(pixel.g, 16, 8); /* g */
         dst[2] = _mesa_unorm_to_unorm(pixel.b, 16, 8); /* b */
         dst[3] = 255; /* a */
#endif
      src += 6;
      dst += 4;
   }
}

void
util_format_r16g16b16_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_unorm pixel = {0};
         pixel.r = _mesa_unorm_to_unorm(src[0], 8, 16);
         pixel.g = _mesa_unorm_to_unorm(src[1], 8, 16);
         pixel.b = _mesa_unorm_to_unorm(src[2], 8, 16);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_unorm pixel = {0};
         pixel.r = _mesa_unorm_to_unorm(src[0], 8, 16);
         pixel.g = _mesa_unorm_to_unorm(src[1], 8, 16);
         pixel.b = _mesa_unorm_to_unorm(src[2], 8, 16);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R16G16B16A16_UNORM texel: four 16-bit channels
 * in r, g, b, a order.  The layout is the same on big- and little-endian
 * builds.
 */
struct util_format_r16g16b16a16_unorm {
   uint16_t r;
   uint16_t g;
   uint16_t b;
   uint16_t a;
};

void
util_format_r16g16b16a16_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0f/0xffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0f/0xffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0f/0xffff)); /* b */
         dst[3] = (float)(pixel.a * (1.0f/0xffff)); /* a */
#else
         struct util_format_r16g16b16a16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0f/0xffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0f/0xffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0f/0xffff)); /* b */
         dst[3] = (float)(pixel.a * (1.0f/0xffff)); /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r16g16b16a16_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_unorm pixel = {0};
         pixel.r = (uint16_t)util_iround(CLAMP(src[0], 0.0f, 1.0f) * 0xffff);
         pixel.g = (uint16_t)util_iround(CLAMP(src[1], 0.0f, 1.0f) * 0xffff);
         pixel.b = (uint16_t)util_iround(CLAMP(src[2], 0.0f, 1.0f) * 0xffff);
         pixel.a = (uint16_t)util_iround(CLAMP(src[3], 0.0f, 1.0f) * 0xffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_unorm pixel = {0};
         pixel.r = (uint16_t)util_iround(CLAMP(src[0], 0.0f, 1.0f) * 0xffff);
         pixel.g = (uint16_t)util_iround(CLAMP(src[1], 0.0f, 1.0f) * 0xffff);
         pixel.b = (uint16_t)util_iround(CLAMP(src[2], 0.0f, 1.0f) * 0xffff);
         pixel.a = (uint16_t)util_iround(CLAMP(src[3], 0.0f, 1.0f) * 0xffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16A16_UNORM texel as RGBA float.
 *
 * Copies one 8-byte texel from \p src and scales all four channels by
 * 1/65535.  The coordinates \p i and \p j are not used.
 *
 * \param in_dst  output, receives 4 floats
 * \param src     packed texel
 */
void
util_format_r16g16b16a16_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r16g16b16a16_unorm texel;

   /* The decoding is the same for big- and little-endian builds. */
   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)(texel.r * (1.0f/0xffff)); /* r */
   rgba[1] = (float)(texel.g * (1.0f/0xffff)); /* g */
   rgba[2] = (float)(texel.b * (1.0f/0xffff)); /* b */
   rgba[3] = (float)(texel.a * (1.0f/0xffff)); /* a */
}

void
util_format_r16g16b16a16_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_unorm_to_unorm(pixel.r, 16, 8); /* r */
         dst[1] = _mesa_unorm_to_unorm(pixel.g, 16, 8); /* g */
         dst[2] = _mesa_unorm_to_unorm(pixel.b, 16, 8); /* b */
         dst[3] = _mesa_unorm_to_unorm(pixel.a, 16, 8); /* a */
#else
         struct util_format_r16g16b16a16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_unorm_to_unorm(pixel.r, 16, 8); /* r */
         dst[1] = _mesa_unorm_to_unorm(pixel.g, 16, 8); /* g */
         dst[2] = _mesa_unorm_to_unorm(pixel.b, 16, 8); /* b */
         dst[3] = _mesa_unorm_to_unorm(pixel.a, 16, 8); /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r16g16b16a16_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_unorm pixel = {0};
         pixel.r = _mesa_unorm_to_unorm(src[0], 8, 16);
         pixel.g = _mesa_unorm_to_unorm(src[1], 8, 16);
         pixel.b = _mesa_unorm_to_unorm(src[2], 8, 16);
         pixel.a = _mesa_unorm_to_unorm(src[3], 8, 16);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_unorm pixel = {0};
         pixel.r = _mesa_unorm_to_unorm(src[0], 8, 16);
         pixel.g = _mesa_unorm_to_unorm(src[1], 8, 16);
         pixel.b = _mesa_unorm_to_unorm(src[2], 8, 16);
         pixel.a = _mesa_unorm_to_unorm(src[3], 8, 16);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R16_USCALED texels to RGBA float.
 *
 * Red is the 16-bit texel converted directly to float (no normalisation);
 * green and blue are written as 0 and alpha as 1.
 *
 * \param dst_row  output, 4 floats per pixel
 * \param src      packed input texels, 2 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is a byte pointer, so nothing guarantees 16-bit alignment:
       * load with memcpy instead of dereferencing a cast pointer. */
      uint16_t r;
      memcpy(&r, src, sizeof r);

      dst[0] = (float)r; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R16_USCALED texels.
 *
 * Only the red float of each 4-float source pixel is used: it is clamped
 * to [0, 65535] and converted to a 16-bit integer by a cast.
 *
 * \param dst_row     first destination row
 * \param dst_stride  distance between destination rows, in bytes
 * \param src_row     first source row
 * \param src_stride  distance between source rows, in bytes
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)CLAMP(src[0], 0.0f, 65535.0f);
         /* dst is a byte pointer advanced by an arbitrary byte stride, so
          * store with memcpy rather than through a uint16_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes while src_row counts floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16_USCALED texel as RGBA float.
 *
 * Red is the 16-bit texel converted directly to float; green and blue are
 * 0 and alpha is 1.  The coordinates \p i and \p j are not used.
 *
 * \param in_dst  output, receives 4 floats
 * \param src     packed texel, 2 bytes
 */
void
util_format_r16_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   /* src is a byte pointer, so nothing guarantees 16-bit alignment:
    * load with memcpy instead of dereferencing a cast pointer. */
   uint16_t r;
   memcpy(&r, src, sizeof r);

   dst[0] = (float)r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a run of R16_USCALED texels to RGBA8 unorm.
 *
 * The red value is limited to at most 1 with MIN2() and then multiplied
 * by 255, so any non-zero texel yields 255.  Green and blue are written
 * as 0 and alpha as 255.
 *
 * \param dst_row  output, 4 bytes per pixel
 * \param src      packed input texels, 2 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is a byte pointer, so nothing guarantees 16-bit alignment:
       * load with memcpy instead of dereferencing a cast pointer. */
      uint16_t r;
      memcpy(&r, src, sizeof r);

      dst[0] = (uint8_t)(((uint32_t)MIN2(r, 1)) * 0xff / 0x1); /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack RGBA8 unorm pixels into R16_USCALED texels.
 *
 * Only the red byte of each 4-byte source pixel is used.  It is divided
 * by 255 with integer division, so the stored value is 1 for a red byte
 * of 255 and 0 otherwise.
 *
 * \param dst_row     first destination row
 * \param dst_stride  distance between destination rows, in bytes
 * \param src_row     first source row
 * \param src_stride  distance between source rows, in bytes
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         /* dst is a byte pointer advanced by an arbitrary byte stride, so
          * store with memcpy rather than through a uint16_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R16G16_USCALED texels to RGBA float.
 *
 * Each texel is read as one 32-bit word; red and green are its two 16-bit
 * halves (which half is which depends on UTIL_ARCH_BIG_ENDIAN), converted
 * directly to float.  Blue is written as 0 and alpha as 1.
 *
 * \param dst_row  output, 4 floats per pixel
 * \param src      packed input texels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16g16_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is a byte pointer, so nothing guarantees 32-bit alignment:
       * load with memcpy instead of dereferencing a cast pointer. */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      const uint32_t r = value >> 16;
      const uint32_t g = (value) & 0xffff;
#else
      const uint32_t r = (value) & 0xffff;
      const uint32_t g = value >> 16;
#endif
      dst[0] = (float)r; /* r */
      dst[1] = (float)g; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R16G16_USCALED texels.
 *
 * Red and green of each 4-float source pixel are clamped to [0, 65535]
 * and cast to 16-bit integers, then combined into one 32-bit word whose
 * half assignment depends on UTIL_ARCH_BIG_ENDIAN.  Blue and alpha are
 * never read.
 *
 * \param dst_row     first destination row
 * \param dst_stride  distance between destination rows, in bytes
 * \param src_row     first source row
 * \param src_stride  distance between source rows, in bytes
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16g16_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t r = (uint16_t)CLAMP(src[0], 0.0f, 65535.0f);
         const uint16_t g = (uint16_t)CLAMP(src[1], 0.0f, 65535.0f);
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = ((uint32_t)r << 16) | g;
#else
         const uint32_t value = r | ((uint32_t)g << 16);
#endif
         /* dst is a byte pointer advanced by an arbitrary byte stride, so
          * store with memcpy rather than through a uint32_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes while src_row counts floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16_USCALED texel as RGBA float.
 *
 * The texel is read as one 32-bit word; red and green are its two 16-bit
 * halves (selected by UTIL_ARCH_BIG_ENDIAN), converted directly to float.
 * Blue is 0 and alpha is 1.  The coordinates \p i and \p j are not used.
 *
 * \param in_dst  output, receives 4 floats
 * \param src     packed texel, 4 bytes
 */
void
util_format_r16g16_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   /* src is a byte pointer, so nothing guarantees 32-bit alignment:
    * load with memcpy instead of dereferencing a cast pointer. */
   uint32_t value;
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   const uint32_t r = value >> 16;
   const uint32_t g = (value) & 0xffff;
#else
   const uint32_t r = (value) & 0xffff;
   const uint32_t g = value >> 16;
#endif
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a run of R16G16_USCALED texels to RGBA8 unorm.
 *
 * Each texel is read as one 32-bit word; red and green are its two 16-bit
 * halves (selected by UTIL_ARCH_BIG_ENDIAN).  Each is limited to at most
 * 1 with MIN2() and multiplied by 255, so any non-zero channel yields
 * 255.  Blue is written as 0 and alpha as 255.
 *
 * \param dst_row  output, 4 bytes per pixel
 * \param src      packed input texels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16g16_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is a byte pointer, so nothing guarantees 32-bit alignment:
       * load with memcpy instead of dereferencing a cast pointer. */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      const uint32_t r = value >> 16;
      const uint32_t g = (value) & 0xffff;
#else
      const uint32_t r = (value) & 0xffff;
      const uint32_t g = value >> 16;
#endif
      dst[0] = (uint8_t)(((uint32_t)MIN2(r, 1)) * 0xff / 0x1); /* r */
      dst[1] = (uint8_t)(((uint32_t)MIN2(g, 1)) * 0xff / 0x1); /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA8 unorm pixels into R16G16_USCALED texels.
 *
 * Red and green bytes of each 4-byte source pixel are divided by 255 with
 * integer division (1 for a byte of 255, otherwise 0) and combined into
 * one 32-bit word whose half assignment depends on UTIL_ARCH_BIG_ENDIAN.
 * Blue and alpha are never read.
 *
 * \param dst_row     first destination row
 * \param dst_stride  distance between destination rows, in bytes
 * \param src_row     first source row
 * \param src_stride  distance between source rows, in bytes
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16g16_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t r = (uint16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         const uint16_t g = (uint16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = ((uint32_t)r << 16) | g;
#else
         const uint32_t value = r | ((uint32_t)g << 16);
#endif
         /* dst is a byte pointer advanced by an arbitrary byte stride, so
          * store with memcpy rather than through a uint32_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R16G16B16_USCALED texel: three 16-bit channels
 * in r, g, b order.  The layout is the same on big- and little-endian
 * builds.
 */
struct util_format_r16g16b16_uscaled {
   uint16_t r;
   uint16_t g;
   uint16_t b;
};

void
util_format_r16g16b16_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 6;
      dst += 4;
   }
}

void
util_format_r16g16b16_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uscaled pixel = {0};
         pixel.r = (uint16_t)CLAMP(src[0], 0.0f, 65535.0f);
         pixel.g = (uint16_t)CLAMP(src[1], 0.0f, 65535.0f);
         pixel.b = (uint16_t)CLAMP(src[2], 0.0f, 65535.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_uscaled pixel = {0};
         pixel.r = (uint16_t)CLAMP(src[0], 0.0f, 65535.0f);
         pixel.g = (uint16_t)CLAMP(src[1], 0.0f, 65535.0f);
         pixel.b = (uint16_t)CLAMP(src[2], 0.0f, 65535.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16_USCALED pixel as RGBA floats.
 *
 * The 6 bytes at src are copied into a struct util_format_r16g16b16_uscaled;
 * each channel is cast to float and alpha is written as 1.  The i and j
 * arguments are unused.
 *
 * \param in_dst  destination, 4 floats are written
 * \param src     packed source pixel, 6 bytes
 */
void
util_format_r16g16b16_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r16g16b16_uscaled texel;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)texel.r; /* r */
   rgba[1] = (float)texel.g; /* g */
   rgba[2] = (float)texel.b; /* b */
   rgba[3] = 1; /* a */
}

void
util_format_r16g16b16_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r16g16b16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
      src += 6;
      dst += 4;
   }
}

void
util_format_r16g16b16_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uscaled pixel = {0};
         pixel.r = (uint16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_uscaled pixel = {0};
         pixel.r = (uint16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R16G16B16A16_USCALED pixel: four unsigned 16-bit
 * channels stored in r, g, b, a order.  The member order is the same
 * whether or not UTIL_ARCH_BIG_ENDIAN is set, so no conditional is needed.
 */
struct util_format_r16g16b16a16_uscaled {
   uint16_t r;
   uint16_t g;
   uint16_t b;
   uint16_t a;
};

void
util_format_r16g16b16a16_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#else
         struct util_format_r16g16b16a16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r16g16b16a16_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uscaled pixel = {0};
         pixel.r = (uint16_t)CLAMP(src[0], 0.0f, 65535.0f);
         pixel.g = (uint16_t)CLAMP(src[1], 0.0f, 65535.0f);
         pixel.b = (uint16_t)CLAMP(src[2], 0.0f, 65535.0f);
         pixel.a = (uint16_t)CLAMP(src[3], 0.0f, 65535.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_uscaled pixel = {0};
         pixel.r = (uint16_t)CLAMP(src[0], 0.0f, 65535.0f);
         pixel.g = (uint16_t)CLAMP(src[1], 0.0f, 65535.0f);
         pixel.b = (uint16_t)CLAMP(src[2], 0.0f, 65535.0f);
         pixel.a = (uint16_t)CLAMP(src[3], 0.0f, 65535.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16A16_USCALED pixel as RGBA floats.
 *
 * The 8 bytes at src are copied into a
 * struct util_format_r16g16b16a16_uscaled and each channel is cast to
 * float.  The i and j arguments are unused.
 *
 * \param in_dst  destination, 4 floats are written
 * \param src     packed source pixel, 8 bytes
 */
void
util_format_r16g16b16a16_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r16g16b16a16_uscaled texel;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)texel.r; /* r */
   rgba[1] = (float)texel.g; /* g */
   rgba[2] = (float)texel.b; /* b */
   rgba[3] = (float)texel.a; /* a */
}

void
util_format_r16g16b16a16_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint32_t)MIN2(pixel.a, 1)) * 0xff / 0x1); /* a */
#else
         struct util_format_r16g16b16a16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint32_t)MIN2(pixel.a, 1)) * 0xff / 0x1); /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r16g16b16a16_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uscaled pixel = {0};
         pixel.r = (uint16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.a = (uint16_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_uscaled pixel = {0};
         pixel.r = (uint16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.a = (uint16_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R16_SNORM pixels into RGBA floats.
 *
 * Red is the signed 16-bit source value scaled by 1/0x7fff and clamped so
 * that it never goes below -1.0; green and blue are 0 and alpha is 1.
 *
 * The source is read with memcpy() because src is a byte pointer and a
 * direct 16-bit load through it could be misaligned.
 *
 * \param dst_row  destination, 4 floats written per pixel
 * \param src      packed source pixels, 2 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      int16_t r;
      float red;

      memcpy(&r, src, sizeof r);
      red = (float)(r * (1.0f/0x7fff));
      /* -32768 scales to just below -1.0; SNORM decoding saturates at -1.0. */
      dst[0] = red < -1.0f ? -1.0f : red; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R16_SNORM.
 *
 * Only red is used: it is limited to [-1, 1] with CLAMP, multiplied by
 * 0x7fff, rounded with util_iround() and stored as a signed 16-bit value.
 *
 * The result is written with memcpy() because dst_row is a byte pointer and
 * a direct 16-bit store through it could be misaligned.
 *
 * \param dst_row     first destination row, 2 bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, 4 floats per pixel
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const int16_t r = (int16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7fff);
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; src_row advances in floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16_SNORM pixel as RGBA floats.
 *
 * Red is the signed 16-bit source value scaled by 1/0x7fff and clamped so
 * that it never goes below -1.0; green and blue are 0 and alpha is 1.  The
 * i and j arguments are unused.
 *
 * The source is read with memcpy() because src is a byte pointer and a
 * direct 16-bit load through it could be misaligned.
 *
 * \param in_dst  destination, 4 floats are written
 * \param src     packed source pixel, 2 bytes
 */
void
util_format_r16_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   int16_t r;
   float red;

   memcpy(&r, src, sizeof r);
   red = (float)(r * (1.0f/0x7fff));
   /* -32768 scales to just below -1.0; SNORM decoding saturates at -1.0. */
   dst[0] = red < -1.0f ? -1.0f : red; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a run of R16_SNORM pixels into RGBA8 UNORM.
 *
 * Red is raised to at least 0 with MAX2 and converted with
 * _mesa_snorm_to_unorm(..., 16, 8); green and blue are 0, alpha is 255.
 *
 * The source is read with memcpy() because src is a byte pointer and a
 * direct 16-bit load through it could be misaligned.
 *
 * \param dst_row  destination, 4 bytes written per pixel
 * \param src      packed source pixels, 2 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      int16_t r;

      memcpy(&r, src, sizeof r);
      dst[0] = _mesa_snorm_to_unorm(MAX2(r, 0), 16, 8); /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack RGBA8 UNORM pixels into R16_SNORM.
 *
 * Only the red byte is used; it is converted with
 * _mesa_unorm_to_snorm(..., 8, 16) and stored as a 16-bit value.
 *
 * The result is written with memcpy() because dst_row is a byte pointer and
 * a direct 16-bit store through it could be misaligned.
 *
 * \param dst_row     first destination row, 2 bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, 4 bytes (RGBA) per pixel
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)(_mesa_unorm_to_snorm(src[0], 8, 16));
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R16G16_SNORM pixels into RGBA floats.
 *
 * Each pixel is one 32-bit word holding two signed 16-bit channels (red in
 * the low half by default, in the high half when UTIL_ARCH_BIG_ENDIAN is
 * set).  Each channel is scaled by 1/0x7fff and clamped so that it never
 * goes below -1.0; blue is 0 and alpha is 1.
 *
 * The word is read with memcpy() because src is a byte pointer and a direct
 * 32-bit load through it could be misaligned.
 *
 * \param dst_row  destination, 4 floats written per pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16g16_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;
      memcpy(&value, src, sizeof value);
      /* Shift up then arithmetic-shift down to sign-extend each 16-bit field. */
#if UTIL_ARCH_BIG_ENDIAN
      const int32_t r = ((int32_t)(value)) >> 16;
      const int32_t g = ((int32_t)(value << 16)) >> 16;
#else
      const int32_t r = ((int32_t)(value << 16)) >> 16;
      const int32_t g = ((int32_t)(value)) >> 16;
#endif
      const float red = (float)(r * (1.0f/0x7fff));
      const float green = (float)(g * (1.0f/0x7fff));

      /* -32768 scales to just below -1.0; SNORM decoding saturates at -1.0. */
      dst[0] = red < -1.0f ? -1.0f : red; /* r */
      dst[1] = green < -1.0f ? -1.0f : green; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R16G16_SNORM.
 *
 * Red and green are each limited to [-1, 1] with CLAMP, multiplied by
 * 0x7fff, rounded with util_iround() and narrowed to 16 bits.  The two
 * fields are combined into one 32-bit word (red in the low half by default,
 * in the high half when UTIL_ARCH_BIG_ENDIAN is set).  Blue and alpha are
 * ignored.
 *
 * The word is written with memcpy() because dst_row is a byte pointer and a
 * direct 32-bit store through it could be misaligned.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, 4 floats per pixel
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16g16_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         /* The uint16_t cast keeps only the low 16 bits of each signed field. */
         const uint32_t r = (uint16_t)(int16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7fff);
         const uint32_t g = (uint16_t)(int16_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7fff);
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = (r << 16) | g;
#else
         const uint32_t value = r | (g << 16);
#endif
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; src_row advances in floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16_SNORM pixel as RGBA floats.
 *
 * The pixel is one 32-bit word holding two signed 16-bit channels (red in
 * the low half by default, in the high half when UTIL_ARCH_BIG_ENDIAN is
 * set).  Each channel is scaled by 1/0x7fff and clamped so that it never
 * goes below -1.0; blue is 0 and alpha is 1.  The i and j arguments are
 * unused.
 *
 * The word is read with memcpy() because src is a byte pointer and a direct
 * 32-bit load through it could be misaligned.
 *
 * \param in_dst  destination, 4 floats are written
 * \param src     packed source pixel, 4 bytes
 */
void
util_format_r16g16_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   uint32_t value;
   memcpy(&value, src, sizeof value);
   /* Shift up then arithmetic-shift down to sign-extend each 16-bit field. */
#if UTIL_ARCH_BIG_ENDIAN
   const int32_t r = ((int32_t)(value)) >> 16;
   const int32_t g = ((int32_t)(value << 16)) >> 16;
#else
   const int32_t r = ((int32_t)(value << 16)) >> 16;
   const int32_t g = ((int32_t)(value)) >> 16;
#endif
   const float red = (float)(r * (1.0f/0x7fff));
   const float green = (float)(g * (1.0f/0x7fff));

   /* -32768 scales to just below -1.0; SNORM decoding saturates at -1.0. */
   dst[0] = red < -1.0f ? -1.0f : red; /* r */
   dst[1] = green < -1.0f ? -1.0f : green; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a run of R16G16_SNORM pixels into RGBA8 UNORM.
 *
 * Each pixel is one 32-bit word holding two signed 16-bit channels (red in
 * the low half by default, in the high half when UTIL_ARCH_BIG_ENDIAN is
 * set).  Each channel is raised to at least 0 with MAX2 and converted with
 * _mesa_snorm_to_unorm(..., 16, 8); blue is 0 and alpha is 255.
 *
 * The word is read with memcpy() because src is a byte pointer and a direct
 * 32-bit load through it could be misaligned.
 *
 * \param dst_row  destination, 4 bytes written per pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16g16_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;
      memcpy(&value, src, sizeof value);
      /* Shift up then arithmetic-shift down to sign-extend each 16-bit field. */
#if UTIL_ARCH_BIG_ENDIAN
      const int32_t r = ((int32_t)(value)) >> 16;
      const int32_t g = ((int32_t)(value << 16)) >> 16;
#else
      const int32_t r = ((int32_t)(value << 16)) >> 16;
      const int32_t g = ((int32_t)(value)) >> 16;
#endif
      dst[0] = _mesa_snorm_to_unorm(MAX2(r, 0), 16, 8); /* r */
      dst[1] = _mesa_snorm_to_unorm(MAX2(g, 0), 16, 8); /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA8 UNORM pixels into R16G16_SNORM.
 *
 * The red and green bytes are each converted with
 * _mesa_unorm_to_snorm(..., 8, 16), reduced to 16 bits and combined into
 * one 32-bit word (red in the low half by default, in the high half when
 * UTIL_ARCH_BIG_ENDIAN is set).  Blue and alpha are ignored.
 *
 * The word is written with memcpy() because dst_row is a byte pointer and a
 * direct 32-bit store through it could be misaligned.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, 4 bytes (RGBA) per pixel
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16g16_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const uint32_t r = (uint32_t)(_mesa_unorm_to_snorm(src[0], 8, 16)) & 0xffff;
         const uint32_t g = (uint32_t)(_mesa_unorm_to_snorm(src[1], 8, 16)) & 0xffff;
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = (r << 16) | g;
#else
         const uint32_t value = r | (g << 16);
#endif
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R16G16B16_SNORM pixel: three signed 16-bit
 * channels stored in r, g, b order.  The member order is the same whether
 * or not UTIL_ARCH_BIG_ENDIAN is set, so no conditional is needed.
 */
struct util_format_r16g16b16_snorm {
   int16_t r;
   int16_t g;
   int16_t b;
};

/**
 * Unpack a run of R16G16B16_SNORM pixels into RGBA floats.
 *
 * Every 6-byte source pixel is copied into a
 * struct util_format_r16g16b16_snorm.  Each channel is scaled by 1/0x7fff
 * and clamped so that it never goes below -1.0; alpha is written as 1.  The
 * conversion is the same for either byte order.
 *
 * \param dst_row  destination, 4 floats written per pixel
 * \param src      packed source pixels, 6 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16g16b16_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const float scale = 1.0f/0x7fff;
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      struct util_format_r16g16b16_snorm pixel;
      memcpy(&pixel, src, sizeof pixel);
      /* -32768 scales to just below -1.0; SNORM decoding saturates at -1.0. */
      dst[0] = MAX2(-1.0f, (float)(pixel.r * scale)); /* r */
      dst[1] = MAX2(-1.0f, (float)(pixel.g * scale)); /* g */
      dst[2] = MAX2(-1.0f, (float)(pixel.b * scale)); /* b */
      dst[3] = 1; /* a */
      src += 6;
      dst += 4;
   }
}

void
util_format_r16g16b16_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_snorm pixel = {0};
         pixel.r = (int16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7fff);
         pixel.g = (int16_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7fff);
         pixel.b = (int16_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7fff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_snorm pixel = {0};
         pixel.r = (int16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7fff);
         pixel.g = (int16_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7fff);
         pixel.b = (int16_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7fff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16_SNORM pixel as RGBA floats.
 *
 * The 6 bytes at src are copied into a struct util_format_r16g16b16_snorm.
 * Each channel is scaled by 1/0x7fff and clamped so that it never goes
 * below -1.0; alpha is written as 1.  The i and j arguments are unused.
 *
 * \param in_dst  destination, 4 floats are written
 * \param src     packed source pixel, 6 bytes
 */
void
util_format_r16g16b16_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const float scale = 1.0f/0x7fff;
   float *dst = in_dst;
   struct util_format_r16g16b16_snorm pixel;
   memcpy(&pixel, src, sizeof pixel);
   /* -32768 scales to just below -1.0; SNORM decoding saturates at -1.0. */
   dst[0] = MAX2(-1.0f, (float)(pixel.r * scale)); /* r */
   dst[1] = MAX2(-1.0f, (float)(pixel.g * scale)); /* g */
   dst[2] = MAX2(-1.0f, (float)(pixel.b * scale)); /* b */
   dst[3] = 1; /* a */
}

void
util_format_r16g16b16_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 16, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 16, 8); /* g */
         dst[2] = _mesa_snorm_to_unorm(MAX2(pixel.b, 0), 16, 8); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r16g16b16_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 16, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 16, 8); /* g */
         dst[2] = _mesa_snorm_to_unorm(MAX2(pixel.b, 0), 16, 8); /* b */
         dst[3] = 255; /* a */
#endif
      src += 6;
      dst += 4;
   }
}

void
util_format_r16g16b16_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_snorm pixel = {0};
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 16);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 16);
         pixel.b = _mesa_unorm_to_snorm(src[2], 8, 16);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_snorm pixel = {0};
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 16);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 16);
         pixel.b = _mesa_unorm_to_snorm(src[2], 8, 16);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R16G16B16A16_SNORM pixel: four signed 16-bit
 * channels stored in r, g, b, a order.  The member order is the same
 * whether or not UTIL_ARCH_BIG_ENDIAN is set, so no conditional is needed.
 */
struct util_format_r16g16b16a16_snorm {
   int16_t r;
   int16_t g;
   int16_t b;
   int16_t a;
};

/**
 * Unpack a run of R16G16B16A16_SNORM pixels into RGBA floats.
 *
 * Every 8-byte source pixel is copied into a
 * struct util_format_r16g16b16a16_snorm.  Each of the four channels is
 * scaled by 1/0x7fff and clamped so that it never goes below -1.0.  The
 * conversion is the same for either byte order.
 *
 * \param dst_row  destination, 4 floats written per pixel
 * \param src      packed source pixels, 8 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16g16b16a16_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const float scale = 1.0f/0x7fff;
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      struct util_format_r16g16b16a16_snorm pixel;
      memcpy(&pixel, src, sizeof pixel);
      /* -32768 scales to just below -1.0; SNORM decoding saturates at -1.0. */
      dst[0] = MAX2(-1.0f, (float)(pixel.r * scale)); /* r */
      dst[1] = MAX2(-1.0f, (float)(pixel.g * scale)); /* g */
      dst[2] = MAX2(-1.0f, (float)(pixel.b * scale)); /* b */
      dst[3] = MAX2(-1.0f, (float)(pixel.a * scale)); /* a */
      src += 8;
      dst += 4;
   }
}

void
util_format_r16g16b16a16_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_snorm pixel = {0};
         pixel.r = (int16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7fff);
         pixel.g = (int16_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7fff);
         pixel.b = (int16_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7fff);
         pixel.a = (int16_t)util_iround(CLAMP(src[3], -1.0f, 1.0f) * 0x7fff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_snorm pixel = {0};
         pixel.r = (int16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7fff);
         pixel.g = (int16_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7fff);
         pixel.b = (int16_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7fff);
         pixel.a = (int16_t)util_iround(CLAMP(src[3], -1.0f, 1.0f) * 0x7fff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16A16_SNORM pixel as RGBA floats.
 *
 * The 8 bytes at src are copied into a
 * struct util_format_r16g16b16a16_snorm.  Each of the four channels is
 * scaled by 1/0x7fff and clamped so that it never goes below -1.0.  The i
 * and j arguments are unused.
 *
 * \param in_dst  destination, 4 floats are written
 * \param src     packed source pixel, 8 bytes
 */
void
util_format_r16g16b16a16_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const float scale = 1.0f/0x7fff;
   float *dst = in_dst;
   struct util_format_r16g16b16a16_snorm pixel;
   memcpy(&pixel, src, sizeof pixel);
   /* -32768 scales to just below -1.0; SNORM decoding saturates at -1.0. */
   dst[0] = MAX2(-1.0f, (float)(pixel.r * scale)); /* r */
   dst[1] = MAX2(-1.0f, (float)(pixel.g * scale)); /* g */
   dst[2] = MAX2(-1.0f, (float)(pixel.b * scale)); /* b */
   dst[3] = MAX2(-1.0f, (float)(pixel.a * scale)); /* a */
}

void
util_format_r16g16b16a16_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 16, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 16, 8); /* g */
         dst[2] = _mesa_snorm_to_unorm(MAX2(pixel.b, 0), 16, 8); /* b */
         dst[3] = _mesa_snorm_to_unorm(MAX2(pixel.a, 0), 16, 8); /* a */
#else
         struct util_format_r16g16b16a16_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 16, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 16, 8); /* g */
         dst[2] = _mesa_snorm_to_unorm(MAX2(pixel.b, 0), 16, 8); /* b */
         dst[3] = _mesa_snorm_to_unorm(MAX2(pixel.a, 0), 16, 8); /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r16g16b16a16_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_snorm pixel = {0};
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 16);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 16);
         pixel.b = _mesa_unorm_to_snorm(src[2], 8, 16);
         pixel.a = _mesa_unorm_to_snorm(src[3], 8, 16);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_snorm pixel = {0};
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 16);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 16);
         pixel.b = _mesa_unorm_to_snorm(src[2], 8, 16);
         pixel.a = _mesa_unorm_to_snorm(src[3], 8, 16);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R16_SSCALED pixels into RGBA floats.
 *
 * Red is the signed 16-bit source value cast to float; green and blue are
 * 0 and alpha is 1.
 *
 * The source is read with memcpy() because src is a byte pointer and a
 * direct 16-bit load through it could be misaligned.
 *
 * \param dst_row  destination, 4 floats written per pixel
 * \param src      packed source pixels, 2 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      int16_t r;

      memcpy(&r, src, sizeof r);
      dst[0] = (float)r; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R16_SSCALED.
 *
 * Only red is used: it is limited to [-32768, 32767] with CLAMP and
 * truncated to a signed 16-bit value.
 *
 * The result is written with memcpy() because dst_row is a byte pointer and
 * a direct 16-bit store through it could be misaligned.
 *
 * \param dst_row     first destination row, 2 bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, 4 floats per pixel
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const int16_t r = (int16_t)CLAMP(src[0], -32768.0f, 32767.0f);
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; src_row advances in floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16_SSCALED pixel as RGBA floats.
 *
 * Red is the signed 16-bit source value cast to float; green and blue are
 * 0 and alpha is 1.  The i and j arguments are unused.
 *
 * The source is read with memcpy() because src is a byte pointer and a
 * direct 16-bit load through it could be misaligned.
 *
 * \param in_dst  destination, 4 floats are written
 * \param src     packed source pixel, 2 bytes
 */
void
util_format_r16_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   int16_t r;

   memcpy(&r, src, sizeof r);
   dst[0] = (float)r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a run of R16_SSCALED pixels into RGBA8 UNORM.
 *
 * The signed 16-bit red value is clamped to [0, 1] and scaled by 0xff, so
 * any value of 1 or more becomes 255 and any value of 0 or less becomes 0.
 * Green and blue are 0, alpha is 255.
 *
 * The source is read with memcpy() because src is a byte pointer and a
 * direct 16-bit load through it could be misaligned.
 *
 * \param dst_row  destination, 4 bytes written per pixel
 * \param src      packed source pixels, 2 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      int16_t r;

      memcpy(&r, src, sizeof r);
      /* Integer clamp to [0, 1] followed by * 0xff collapses to this test. */
      dst[0] = r > 0 ? 0xff : 0; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack RGBA8 UNORM pixels into R16_SSCALED.
 *
 * Only the red byte is used.  It is divided by 0xff using integer division,
 * so 255 packs to 1 and every smaller value packs to 0.
 *
 * The result is written with memcpy() because dst_row is a byte pointer and
 * a direct 16-bit store through it could be misaligned.
 *
 * \param dst_row     first destination row, 2 bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, 4 bytes (RGBA) per pixel
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const int16_t r = (int16_t)((uint32_t)src[0] / 0xff);
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R16G16_SSCALED pixels into RGBA floats.
 *
 * Each pixel is one 32-bit word holding two signed 16-bit channels (red in
 * the low half by default, in the high half when UTIL_ARCH_BIG_ENDIAN is
 * set).  Each channel is sign-extended and cast to float; blue is 0 and
 * alpha is 1.
 *
 * The word is read with memcpy() because src is a byte pointer and a direct
 * 32-bit load through it could be misaligned.
 *
 * \param dst_row  destination, 4 floats written per pixel
 * \param src      packed source pixels, 4 bytes each
 * \param width    number of pixels to convert
 */
void
util_format_r16g16_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;
      memcpy(&value, src, sizeof value);
      /* Shift up then arithmetic-shift down to sign-extend each 16-bit field. */
#if UTIL_ARCH_BIG_ENDIAN
      const int32_t r = ((int32_t)(value)) >> 16;
      const int32_t g = ((int32_t)(value << 16)) >> 16;
#else
      const int32_t r = ((int32_t)(value << 16)) >> 16;
      const int32_t g = ((int32_t)(value)) >> 16;
#endif
      dst[0] = (float)r; /* r */
      dst[1] = (float)g; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R16G16_SSCALED.
 *
 * Red and green are each limited to [-32768, 32767] with CLAMP, truncated
 * to a signed 16-bit value, and combined into one 32-bit word (red in the
 * low half by default, in the high half when UTIL_ARCH_BIG_ENDIAN is set).
 * Blue and alpha are ignored.
 *
 * The word is written with memcpy() because dst_row is a byte pointer and a
 * direct 32-bit store through it could be misaligned.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, 4 floats per pixel
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16g16_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         /* The uint16_t cast keeps only the low 16 bits of each signed field. */
         const uint32_t r = (uint16_t)(int16_t)CLAMP(src[0], -32768.0f, 32767.0f);
         const uint32_t g = (uint16_t)(int16_t)CLAMP(src[1], -32768.0f, 32767.0f);
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = (r << 16) | g;
#else
         const uint32_t value = r | (g << 16);
#endif
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; src_row advances in floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16_SSCALED pixel as RGBA floats.
 *
 * The two signed 16-bit channels are cast to float without normalization;
 * blue is written as 0 and alpha as 1.  The coordinates \p i and \p j are
 * not used.
 *
 * The word is loaded with memcpy() so that \p src does not have to be
 * 4-byte aligned.
 *
 * \param in_dst  destination, 4 floats
 * \param src     pointer to the 4-byte packed pixel
 */
void
util_format_r16g16_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   uint32_t value;
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   const int32_t r = ((int32_t)(value) ) >> 16;
   const int32_t g = ((int32_t)(value << 16) ) >> 16;
#else
   const int32_t r = ((int32_t)(value << 16) ) >> 16;
   const int32_t g = ((int32_t)(value) ) >> 16;
#endif
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a row of R16G16_SSCALED pixels into RGBA8 unorm.
 *
 * Each signed 16-bit channel is passed through CLAMP() with the bounds
 * [0, 1] and multiplied by 255.  Blue is written as 0 and alpha as 255.
 *
 * The word is loaded with memcpy() so that \p src does not have to be
 * 4-byte aligned.
 *
 * \param dst_row  destination, 4 bytes per pixel
 * \param src      packed source pixels, 4 bytes per pixel
 * \param width    number of pixels to convert
 */
void
util_format_r16g16_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      const int32_t r = ((int32_t)(value) ) >> 16;
      const int32_t g = ((int32_t)(value << 16) ) >> 16;
#else
      const int32_t r = ((int32_t)(value << 16) ) >> 16;
      const int32_t g = ((int32_t)(value) ) >> 16;
#endif
      dst[0] = (uint8_t)(((uint32_t)CLAMP(r, 0, 1)) * 0xff / 0x1); /* r */
      dst[1] = (uint8_t)(((uint32_t)CLAMP(g, 0, 1)) * 0xff / 0x1); /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA8 unorm pixels into R16G16_SSCALED.
 *
 * Each of red and green is integer-divided by 255, so a channel becomes 1
 * only when the source byte is 255 and 0 otherwise.  Blue and alpha are
 * ignored.
 *
 * The word is stored with memcpy() so that \p dst_row does not have to be
 * 4-byte aligned.
 *
 * \param dst_row     destination, 4 bytes per pixel
 * \param dst_stride  bytes between destination rows
 * \param src_row     source, 4 bytes per pixel
 * \param src_stride  bytes between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r16g16_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint32_t r = (uint32_t)((int16_t)(((uint32_t)src[0]) * 0x1 / 0xff)) & 0xffff;
         const uint32_t g = (uint32_t)((int16_t)(((uint32_t)src[1]) * 0x1 / 0xff)) & 0xffff;
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = (r << 16) | g;
#else
         const uint32_t value = r | (g << 16);
#endif
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R16G16B16_SSCALED pixel: three signed 16-bit
 * channels stored in r, g, b order.  The member order is the same for
 * big- and little-endian builds.
 */
struct util_format_r16g16b16_sscaled {
   int16_t r;
   int16_t g;
   int16_t b;
};

void
util_format_r16g16b16_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 6;
      dst += 4;
   }
}

void
util_format_r16g16b16_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sscaled pixel = {0};
         pixel.r = (int16_t)CLAMP(src[0], -32768.0f, 32767.0f);
         pixel.g = (int16_t)CLAMP(src[1], -32768.0f, 32767.0f);
         pixel.b = (int16_t)CLAMP(src[2], -32768.0f, 32767.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_sscaled pixel = {0};
         pixel.r = (int16_t)CLAMP(src[0], -32768.0f, 32767.0f);
         pixel.g = (int16_t)CLAMP(src[1], -32768.0f, 32767.0f);
         pixel.b = (int16_t)CLAMP(src[2], -32768.0f, 32767.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16_SSCALED pixel as RGBA floats.
 *
 * The three signed 16-bit channels are cast to float without
 * normalization and alpha is written as 1.  The coordinates \p i and \p j
 * are not used.
 *
 * \param in_dst  destination, 4 floats
 * \param src     pointer to the 6-byte packed pixel
 */
void
util_format_r16g16b16_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r16g16b16_sscaled texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)texel.r;
   rgba[1] = (float)texel.g;
   rgba[2] = (float)texel.b;
   rgba[3] = 1;
}

void
util_format_r16g16b16_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r16g16b16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
      src += 6;
      dst += 4;
   }
}

void
util_format_r16g16b16_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sscaled pixel = {0};
         pixel.r = (int16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_sscaled pixel = {0};
         pixel.r = (int16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R16G16B16A16_SSCALED pixel: four signed 16-bit
 * channels stored in r, g, b, a order.  The member order is the same for
 * big- and little-endian builds.
 */
struct util_format_r16g16b16a16_sscaled {
   int16_t r;
   int16_t g;
   int16_t b;
   int16_t a;
};

void
util_format_r16g16b16a16_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#else
         struct util_format_r16g16b16a16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r16g16b16a16_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sscaled pixel = {0};
         pixel.r = (int16_t)CLAMP(src[0], -32768.0f, 32767.0f);
         pixel.g = (int16_t)CLAMP(src[1], -32768.0f, 32767.0f);
         pixel.b = (int16_t)CLAMP(src[2], -32768.0f, 32767.0f);
         pixel.a = (int16_t)CLAMP(src[3], -32768.0f, 32767.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_sscaled pixel = {0};
         pixel.r = (int16_t)CLAMP(src[0], -32768.0f, 32767.0f);
         pixel.g = (int16_t)CLAMP(src[1], -32768.0f, 32767.0f);
         pixel.b = (int16_t)CLAMP(src[2], -32768.0f, 32767.0f);
         pixel.a = (int16_t)CLAMP(src[3], -32768.0f, 32767.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16A16_SSCALED pixel as RGBA floats.
 *
 * The four signed 16-bit channels are cast to float without
 * normalization.  The coordinates \p i and \p j are not used.
 *
 * \param in_dst  destination, 4 floats
 * \param src     pointer to the 8-byte packed pixel
 */
void
util_format_r16g16b16a16_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r16g16b16a16_sscaled texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)texel.r;
   rgba[1] = (float)texel.g;
   rgba[2] = (float)texel.b;
   rgba[3] = (float)texel.a;
}

void
util_format_r16g16b16a16_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint32_t)CLAMP(pixel.a, 0, 1)) * 0xff / 0x1); /* a */
#else
         struct util_format_r16g16b16a16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint32_t)CLAMP(pixel.a, 0, 1)) * 0xff / 0x1); /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r16g16b16a16_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sscaled pixel = {0};
         pixel.r = (int16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.a = (int16_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_sscaled pixel = {0};
         pixel.r = (int16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.a = (int16_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R8_UNORM pixels into RGBA floats.
 *
 * The single source byte is converted with ubyte_to_float() and stored as
 * red; green and blue are written as 0 and alpha as 1.
 *
 * \param dst_row  destination, 4 floats per pixel
 * \param src      source, 1 byte per pixel
 * \param width    number of pixels to convert
 */
void
util_format_r8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining) {
      const uint8_t red = *src++;
      rgba[0] = ubyte_to_float(red);
      rgba[1] = 0;
      rgba[2] = 0;
      rgba[3] = 1;
      rgba += 4;
   }
}

/**
 * Pack RGBA float pixels into R8_UNORM.
 *
 * Only the red channel is used; it is converted with float_to_ubyte() and
 * stored as one byte per pixel.
 *
 * \param dst_row     destination, 1 byte per pixel
 * \param dst_stride  bytes between destination rows
 * \param src_row     source, 4 floats per pixel
 * \param src_stride  bytes between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         const uint8_t red = float_to_ubyte(rgba[0]);
         *out++ = red;
         rgba += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; src_row is a float pointer */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8_UNORM pixel as RGBA floats.
 *
 * The source byte is converted with ubyte_to_float() and stored as red;
 * green and blue are written as 0 and alpha as 1.  The coordinates \p i
 * and \p j are not used.
 *
 * \param in_dst  destination, 4 floats
 * \param src     pointer to the 1-byte pixel
 */
void
util_format_r8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint8_t red = src[0];

   rgba[0] = ubyte_to_float(red);
   rgba[1] = 0;
   rgba[2] = 0;
   rgba[3] = 1;
}

/**
 * Unpack a row of R8_UNORM pixels into RGBA8 unorm.
 *
 * The source byte is copied to red unchanged; green and blue are written
 * as 0 and alpha as 255.
 *
 * \param dst_row  destination, 4 bytes per pixel
 * \param src      source, 1 byte per pixel
 * \param width    number of pixels to convert
 */
void
util_format_r8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned remaining = width; remaining != 0; --remaining) {
      rgba[0] = *src++;
      rgba[1] = 0;
      rgba[2] = 0;
      rgba[3] = 255;
      rgba += 4;
   }
}

/**
 * Pack RGBA8 unorm pixels into R8_UNORM.
 *
 * Only the red byte of each source pixel is copied; green, blue and alpha
 * are ignored.
 *
 * \param dst_row     destination, 1 byte per pixel
 * \param dst_stride  bytes between destination rows
 * \param src_row     source, 4 bytes per pixel
 * \param src_stride  bytes between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         *out++ = rgba[0];
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R8G8_UNORM pixels into RGBA floats.
 *
 * Each source pixel is one 16-bit word holding two 8-bit channels, each
 * converted with ubyte_to_float().  Blue is written as 0 and alpha as 1.
 *
 * The word is loaded with memcpy() so that \p src does not have to be
 * 2-byte aligned.
 *
 * \param dst_row  destination, 4 floats per pixel
 * \param src      packed source pixels, 2 bytes per pixel
 * \param width    number of pixels to convert
 */
void
util_format_r8g8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint16_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      /* r is the high byte of the word, g the low byte */
      const uint16_t r = value >> 8;
      const uint16_t g = (value) & 0xff;
#else
      /* r is the low byte of the word, g the high byte */
      const uint16_t r = (value) & 0xff;
      const uint16_t g = value >> 8;
#endif
      dst[0] = ubyte_to_float(r); /* r */
      dst[1] = ubyte_to_float(g); /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R8G8_UNORM.
 *
 * Red and green are converted with float_to_ubyte() and combined into one
 * 16-bit word per pixel.  Blue and alpha are ignored.
 *
 * The word is stored with memcpy() so that \p dst_row does not have to be
 * 2-byte aligned.
 *
 * \param dst_row     destination, 2 bytes per pixel
 * \param dst_stride  bytes between destination rows
 * \param src_row     source, 4 floats per pixel
 * \param src_stride  bytes between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r8g8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint32_t r = (uint32_t)float_to_ubyte(src[0]) & 0xff;
         const uint32_t g = (uint32_t)float_to_ubyte(src[1]) & 0xff;
#if UTIL_ARCH_BIG_ENDIAN
         const uint16_t value = (uint16_t)((r << 8) | g);
#else
         const uint16_t value = (uint16_t)(r | (g << 8));
#endif
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; src_row is a float pointer */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8_UNORM pixel as RGBA floats.
 *
 * The two 8-bit channels are converted with ubyte_to_float(); blue is
 * written as 0 and alpha as 1.  The coordinates \p i and \p j are not
 * used.
 *
 * The word is loaded with memcpy() so that \p src does not have to be
 * 2-byte aligned.
 *
 * \param in_dst  destination, 4 floats
 * \param src     pointer to the 2-byte packed pixel
 */
void
util_format_r8g8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   uint16_t value;
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   const uint16_t r = value >> 8;
   const uint16_t g = (value) & 0xff;
#else
   const uint16_t r = (value) & 0xff;
   const uint16_t g = value >> 8;
#endif
   dst[0] = ubyte_to_float(r); /* r */
   dst[1] = ubyte_to_float(g); /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a row of R8G8_UNORM pixels into RGBA8 unorm.
 *
 * The two 8-bit channels of each 16-bit source word are copied to red and
 * green unchanged; blue is written as 0 and alpha as 255.
 *
 * The word is loaded with memcpy() so that \p src does not have to be
 * 2-byte aligned.
 *
 * \param dst_row  destination, 4 bytes per pixel
 * \param src      packed source pixels, 2 bytes per pixel
 * \param width    number of pixels to convert
 */
void
util_format_r8g8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint16_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      const uint16_t r = value >> 8;
      const uint16_t g = (value) & 0xff;
#else
      const uint16_t r = (value) & 0xff;
      const uint16_t g = value >> 8;
#endif
      dst[0] = r; /* r */
      dst[1] = g; /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack RGBA8 unorm pixels into R8G8_UNORM.
 *
 * The red and green bytes of each source pixel are combined into one
 * 16-bit word; blue and alpha are ignored.
 *
 * The word is stored with memcpy() so that \p dst_row does not have to be
 * 2-byte aligned.
 *
 * \param dst_row     destination, 2 bytes per pixel
 * \param dst_stride  bytes between destination rows
 * \param src_row     source, 4 bytes per pixel
 * \param src_stride  bytes between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r8g8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint32_t r = src[0];
         const uint32_t g = src[1];
#if UTIL_ARCH_BIG_ENDIAN
         const uint16_t value = (uint16_t)((r << 8) | g);
#else
         const uint16_t value = (uint16_t)(r | (g << 8));
#endif
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R8G8B8_UNORM pixel: three 8-bit channels stored
 * in r, g, b order.  The member order is the same for big- and
 * little-endian builds.
 */
struct util_format_r8g8b8_unorm {
   uint8_t r;
   uint8_t g;
   uint8_t b;
};

void
util_format_r8g8b8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = ubyte_to_float(pixel.r); /* r */
         dst[1] = ubyte_to_float(pixel.g); /* g */
         dst[2] = ubyte_to_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = ubyte_to_float(pixel.r); /* r */
         dst[1] = ubyte_to_float(pixel.g); /* g */
         dst[2] = ubyte_to_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_r8g8b8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_unorm pixel = {0};
         pixel.r = float_to_ubyte(src[0]);
         pixel.g = float_to_ubyte(src[1]);
         pixel.b = float_to_ubyte(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_unorm pixel = {0};
         pixel.r = float_to_ubyte(src[0]);
         pixel.g = float_to_ubyte(src[1]);
         pixel.b = float_to_ubyte(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8_UNORM pixel as RGBA floats.
 *
 * The three channels are converted with ubyte_to_float() and alpha is
 * written as 1.  The coordinates \p i and \p j are not used.
 *
 * \param in_dst  destination, 4 floats
 * \param src     pointer to the 3-byte packed pixel
 */
void
util_format_r8g8b8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r8g8b8_unorm texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = ubyte_to_float(texel.r);
   rgba[1] = ubyte_to_float(texel.g);
   rgba[2] = ubyte_to_float(texel.b);
   rgba[3] = 1;
}

void
util_format_r8g8b8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r8g8b8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 255; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_r8g8b8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_unorm pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_unorm pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one B8G8R8_UNORM pixel: three 8-bit channels stored
 * in b, g, r order.  The member order is the same for big- and
 * little-endian builds.
 */
struct util_format_b8g8r8_unorm {
   uint8_t b;
   uint8_t g;
   uint8_t r;
};

void
util_format_b8g8r8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = ubyte_to_float(pixel.r); /* r */
         dst[1] = ubyte_to_float(pixel.g); /* g */
         dst[2] = ubyte_to_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = ubyte_to_float(pixel.r); /* r */
         dst[1] = ubyte_to_float(pixel.g); /* g */
         dst[2] = ubyte_to_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_b8g8r8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_unorm pixel = {0};
         pixel.b = float_to_ubyte(src[2]);
         pixel.g = float_to_ubyte(src[1]);
         pixel.r = float_to_ubyte(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_unorm pixel = {0};
         pixel.b = float_to_ubyte(src[2]);
         pixel.g = float_to_ubyte(src[1]);
         pixel.r = float_to_ubyte(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8_UNORM pixel as RGBA floats.
 *
 * The three channels are converted with ubyte_to_float() into RGBA order
 * and alpha is written as 1.  The coordinates \p i and \p j are not used.
 *
 * \param in_dst  destination, 4 floats
 * \param src     pointer to the 3-byte packed pixel
 */
void
util_format_b8g8r8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_b8g8r8_unorm texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = ubyte_to_float(texel.r);
   rgba[1] = ubyte_to_float(texel.g);
   rgba[2] = ubyte_to_float(texel.b);
   rgba[3] = 1;
}

void
util_format_b8g8r8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_b8g8r8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 255; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_b8g8r8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_unorm pixel = {0};
         pixel.b = src[2];
         pixel.g = src[1];
         pixel.r = src[0];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_unorm pixel = {0};
         pixel.b = src[2];
         pixel.g = src[1];
         pixel.r = src[0];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R8G8B8A8_UNORM pixels into RGBA floats.
 *
 * Each source pixel is one 32-bit word holding four 8-bit channels, each
 * converted with ubyte_to_float().
 *
 * The word is loaded with memcpy() so that \p src does not have to be
 * 4-byte aligned.
 *
 * \param dst_row  destination, 4 floats per pixel
 * \param src      packed source pixels, 4 bytes per pixel
 * \param width    number of pixels to convert
 */
void
util_format_r8g8b8a8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      /* r is the most significant byte of the word */
      const uint32_t r = value >> 24;
      const uint32_t g = (value >> 16) & 0xff;
      const uint32_t b = (value >> 8) & 0xff;
      const uint32_t a = (value) & 0xff;
#else
      /* r is the least significant byte of the word */
      const uint32_t r = (value) & 0xff;
      const uint32_t g = (value >> 8) & 0xff;
      const uint32_t b = (value >> 16) & 0xff;
      const uint32_t a = value >> 24;
#endif
      dst[0] = ubyte_to_float(r); /* r */
      dst[1] = ubyte_to_float(g); /* g */
      dst[2] = ubyte_to_float(b); /* b */
      dst[3] = ubyte_to_float(a); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R8G8B8A8_UNORM.
 *
 * Each channel is converted with float_to_ubyte() and the four bytes are
 * combined into one 32-bit word per pixel.
 *
 * The word is stored with memcpy() so that \p dst_row does not have to be
 * 4-byte aligned.
 *
 * \param dst_row     destination, 4 bytes per pixel
 * \param dst_stride  bytes between destination rows
 * \param src_row     source, 4 floats per pixel
 * \param src_stride  bytes between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
void
util_format_r8g8b8a8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint32_t r = (uint32_t)float_to_ubyte(src[0]) & 0xff;
         const uint32_t g = (uint32_t)float_to_ubyte(src[1]) & 0xff;
         const uint32_t b = (uint32_t)float_to_ubyte(src[2]) & 0xff;
         const uint32_t a = (uint32_t)float_to_ubyte(src[3]) & 0xff;
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = (r << 24) | (g << 16) | (b << 8) | a;
#else
         const uint32_t value = r | (g << 8) | (b << 16) | (a << 24);
#endif
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; src_row is a float pointer */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8A8_UNORM pixel as RGBA floats.
 *
 * The four 8-bit channels are converted with ubyte_to_float().  The
 * coordinates \p i and \p j are not used.
 *
 * The word is loaded with memcpy() so that \p src does not have to be
 * 4-byte aligned.
 *
 * \param in_dst  destination, 4 floats
 * \param src     pointer to the 4-byte packed pixel
 */
void
util_format_r8g8b8a8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   uint32_t value;
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   const uint32_t r = value >> 24;
   const uint32_t g = (value >> 16) & 0xff;
   const uint32_t b = (value >> 8) & 0xff;
   const uint32_t a = (value) & 0xff;
#else
   const uint32_t r = (value) & 0xff;
   const uint32_t g = (value >> 8) & 0xff;
   const uint32_t b = (value >> 16) & 0xff;
   const uint32_t a = value >> 24;
#endif
   dst[0] = ubyte_to_float(r); /* r */
   dst[1] = ubyte_to_float(g); /* g */
   dst[2] = ubyte_to_float(b); /* b */
   dst[3] = ubyte_to_float(a); /* a */
}

/**
 * Unpack a row of R8G8B8A8_UNORM pixels into RGBA8 unorm.
 *
 * The four 8-bit channels of each 32-bit source word are extracted and
 * written out in r, g, b, a order unchanged.
 *
 * The word is loaded with memcpy() so that \p src does not have to be
 * 4-byte aligned.
 *
 * \param dst_row  destination, 4 bytes per pixel
 * \param src      packed source pixels, 4 bytes per pixel
 * \param width    number of pixels to convert
 */
void
util_format_r8g8b8a8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      const uint32_t r = value >> 24;
      const uint32_t g = (value >> 16) & 0xff;
      const uint32_t b = (value >> 8) & 0xff;
      const uint32_t a = (value) & 0xff;
#else
      const uint32_t r = (value) & 0xff;
      const uint32_t g = (value >> 8) & 0xff;
      const uint32_t b = (value >> 16) & 0xff;
      const uint32_t a = value >> 24;
#endif
      dst[0] = r; /* r */
      dst[1] = g; /* g */
      dst[2] = b; /* b */
      dst[3] = a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into
 * R8G8B8A8_UNORM.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows of the destination and source respectively.
 *
 * The packed layout has R, G, B, A at byte offsets 0..3 regardless of host
 * endianness, so the channels are stored byte-by-byte.  This avoids a
 * 32-bit store through a uint8_t pointer that carries no alignment
 * guarantee.
 */
void
util_format_r8g8b8a8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = src[0]; /* r */
         dst[1] = src[1]; /* g */
         dst[2] = src[2]; /* b */
         dst[3] = src[3]; /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack width R8_USCALED pixels (one byte each) from src into dst_row as
 * RGBA floats: red is the byte value converted to float, green and blue
 * are 0 and alpha is 1.
 */
void
util_format_r8_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   for (unsigned px = 0; px < width; ++px, rgba += 4) {
      const uint8_t red = src[px];
      rgba[0] = (float)red; /* r */
      rgba[1] = 0;          /* g */
      rgba[2] = 0;          /* b */
      rgba[3] = 1;          /* a */
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into R8_USCALED.
 * Only the red channel is stored: it is clamped to [0, 255] and converted
 * to an 8-bit integer.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows of the destination and source respectively.
 */
void
util_format_r8_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         *out = (uint8_t)CLAMP(rgba[0], 0.0f, 255.0f);
         out += 1;
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single R8_USCALED pixel from src and store it in in_dst as four
 * floats: red is the byte value converted to float, green and blue are 0
 * and alpha is 1.  The i and j arguments are not used.
 */
void
util_format_r8_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint8_t red = src[0];

   rgba[0] = (float)red; /* r */
   rgba[1] = 0;          /* g */
   rgba[2] = 0;          /* b */
   rgba[3] = 1;          /* a */
}

/**
 * Unpack width R8_USCALED pixels (one byte each) from src into dst_row as
 * 8-bit RGBA.  A zero red byte yields 0 and any non-zero red byte yields
 * 0xff; green and blue are 0 and alpha is 255.
 */
void
util_format_r8_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned px = 0; px < width; ++px, rgba += 4) {
      const uint8_t red = src[px];
      /* min(red, 1) scaled by 0xff: either 0 or 0xff. */
      rgba[0] = red ? 0xff : 0; /* r */
      rgba[1] = 0;              /* g */
      rgba[2] = 0;              /* b */
      rgba[3] = 255;            /* a */
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into R8_USCALED.
 * Only the red channel is stored, as the integer quotient red / 0xff, so
 * the result is 1 for a red byte of 0xff and 0 otherwise.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows of the destination and source respectively.
 */
void
util_format_r8_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         *out = (uint8_t)((uint32_t)rgba[0] / 0xff);
         out += 1;
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack width R8G8_USCALED pixels (two bytes each) from src into dst_row
 * as RGBA floats: red and green are the byte values converted to float,
 * blue is 0 and alpha is 1.
 *
 * R is at byte offset 0 and G at byte offset 1 on little- and big-endian
 * hosts alike, so the channels are read byte-by-byte.  This avoids a
 * 16-bit load through a uint8_t pointer that carries no alignment
 * guarantee.
 */
void
util_format_r8g8_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      dst[0] = (float)src[0]; /* r */
      dst[1] = (float)src[1]; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into R8G8_USCALED.
 * Red and green are each clamped to [0, 255] and converted to an 8-bit
 * integer; blue and alpha are dropped.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows of the destination and source respectively.
 *
 * R goes to byte offset 0 and G to byte offset 1 on little- and big-endian
 * hosts alike, so the channels are stored byte-by-byte.  This avoids a
 * 16-bit store through a uint8_t pointer that carries no alignment
 * guarantee.
 */
void
util_format_r8g8_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)CLAMP(src[0], 0.0f, 255.0f); /* r */
         dst[1] = (uint8_t)CLAMP(src[1], 0.0f, 255.0f); /* g */
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8_USCALED pixel from src and store it in in_dst as
 * four floats: red and green are the byte values converted to float, blue
 * is 0 and alpha is 1.  The i and j arguments are not used.
 *
 * R is at byte offset 0 and G at byte offset 1 on little- and big-endian
 * hosts alike, so the channels are read byte-by-byte.  This avoids a
 * 16-bit load through a uint8_t pointer that carries no alignment
 * guarantee.
 */
void
util_format_r8g8_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;

   dst[0] = (float)src[0]; /* r */
   dst[1] = (float)src[1]; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack width R8G8_USCALED pixels (two bytes each) from src into dst_row
 * as 8-bit RGBA.  For red and green a zero byte yields 0 and any non-zero
 * byte yields 0xff (min(value, 1) scaled by 0xff); blue is 0 and alpha is
 * 255.
 *
 * R is at byte offset 0 and G at byte offset 1 on little- and big-endian
 * hosts alike, so the channels are read byte-by-byte.  This avoids a
 * 16-bit load through a uint8_t pointer that carries no alignment
 * guarantee.
 */
void
util_format_r8g8_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      dst[0] = src[0] ? 0xff : 0; /* r */
      dst[1] = src[1] ? 0xff : 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into R8G8_USCALED.
 * Red and green are each stored as the integer quotient value / 0xff, so
 * the result is 1 for an input byte of 0xff and 0 otherwise; blue and
 * alpha are dropped.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows of the destination and source respectively.
 *
 * R goes to byte offset 0 and G to byte offset 1 on little- and big-endian
 * hosts alike, so the channels are stored byte-by-byte.  This avoids a
 * 16-bit store through a uint8_t pointer that carries no alignment
 * guarantee.
 */
void
util_format_r8g8_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)(src[0] / 0xff); /* r */
         dst[1] = (uint8_t)(src[1] / 0xff); /* g */
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * In-memory layout of one R8G8B8_USCALED pixel: three consecutive 8-bit
 * channels.  The member order is the same on little- and big-endian hosts,
 * so no endian-specific variant is needed.
 */
struct util_format_r8g8b8_uscaled {
   uint8_t r; /* first byte */
   uint8_t g; /* second byte */
   uint8_t b; /* third byte */
};

void
util_format_r8g8b8_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_r8g8b8_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uscaled pixel = {0};
         pixel.r = (uint8_t)CLAMP(src[0], 0.0f, 255.0f);
         pixel.g = (uint8_t)CLAMP(src[1], 0.0f, 255.0f);
         pixel.b = (uint8_t)CLAMP(src[2], 0.0f, 255.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_uscaled pixel = {0};
         pixel.r = (uint8_t)CLAMP(src[0], 0.0f, 255.0f);
         pixel.g = (uint8_t)CLAMP(src[1], 0.0f, 255.0f);
         pixel.b = (uint8_t)CLAMP(src[2], 0.0f, 255.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8_USCALED pixel from src and store it in in_dst as
 * four floats: red, green and blue are the byte values converted to float
 * and alpha is 1.  The i and j arguments are not used.
 *
 * The handling is the same on little- and big-endian hosts.
 */
void
util_format_r8g8b8_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r8g8b8_uscaled px;

   memcpy(&px, src, sizeof px);
   rgba[0] = (float)px.r; /* r */
   rgba[1] = (float)px.g; /* g */
   rgba[2] = (float)px.b; /* b */
   rgba[3] = 1;           /* a */
}

void
util_format_r8g8b8_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r8g8b8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_r8g8b8_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uscaled pixel = {0};
         pixel.r = (uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_uscaled pixel = {0};
         pixel.r = (uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * In-memory layout of one B8G8R8_USCALED pixel: three consecutive 8-bit
 * channels, blue first.  The member order is the same on little- and
 * big-endian hosts, so no endian-specific variant is needed.
 */
struct util_format_b8g8r8_uscaled {
   uint8_t b; /* first byte */
   uint8_t g; /* second byte */
   uint8_t r; /* third byte */
};

void
util_format_b8g8r8_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_b8g8r8_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uscaled pixel = {0};
         pixel.b = (uint8_t)CLAMP(src[2], 0.0f, 255.0f);
         pixel.g = (uint8_t)CLAMP(src[1], 0.0f, 255.0f);
         pixel.r = (uint8_t)CLAMP(src[0], 0.0f, 255.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_uscaled pixel = {0};
         pixel.b = (uint8_t)CLAMP(src[2], 0.0f, 255.0f);
         pixel.g = (uint8_t)CLAMP(src[1], 0.0f, 255.0f);
         pixel.r = (uint8_t)CLAMP(src[0], 0.0f, 255.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8_USCALED pixel from src and store it in in_dst as
 * four floats: red, green and blue are the corresponding struct members
 * converted to float and alpha is 1.  The i and j arguments are not used.
 *
 * The handling is the same on little- and big-endian hosts.
 */
void
util_format_b8g8r8_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_b8g8r8_uscaled px;

   memcpy(&px, src, sizeof px);
   rgba[0] = (float)px.r; /* r */
   rgba[1] = (float)px.g; /* g */
   rgba[2] = (float)px.b; /* b */
   rgba[3] = 1;           /* a */
}

void
util_format_b8g8r8_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_b8g8r8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_b8g8r8_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uscaled pixel = {0};
         pixel.b = (uint8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.g = (uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.r = (uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_uscaled pixel = {0};
         pixel.b = (uint8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.g = (uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.r = (uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack width R8G8B8A8_USCALED pixels (four bytes each) from src into
 * dst_row as RGBA floats, each channel being the byte value converted to
 * float.
 *
 * R, G, B and A occupy byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are read byte-by-byte.  This avoids a 32-bit load
 * through a uint8_t pointer that carries no alignment guarantee.
 */
void
util_format_r8g8b8a8_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      dst[0] = (float)src[0]; /* r */
      dst[1] = (float)src[1]; /* g */
      dst[2] = (float)src[2]; /* b */
      dst[3] = (float)src[3]; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into
 * R8G8B8A8_USCALED.  Every channel is clamped to [0, 255] and converted to
 * an 8-bit integer.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows of the destination and source respectively.
 *
 * R, G, B and A go to byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are stored byte-by-byte.  This avoids a 32-bit
 * store through a uint8_t pointer that carries no alignment guarantee.
 */
void
util_format_r8g8b8a8_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)CLAMP(src[0], 0.0f, 255.0f); /* r */
         dst[1] = (uint8_t)CLAMP(src[1], 0.0f, 255.0f); /* g */
         dst[2] = (uint8_t)CLAMP(src[2], 0.0f, 255.0f); /* b */
         dst[3] = (uint8_t)CLAMP(src[3], 0.0f, 255.0f); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8A8_USCALED pixel from src and store it in in_dst as
 * four floats in R, G, B, A order, each being the byte value converted to
 * float.  The i and j arguments are not used.
 *
 * R, G, B and A occupy byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are read byte-by-byte.  This avoids a 32-bit load
 * through a uint8_t pointer that carries no alignment guarantee.
 */
void
util_format_r8g8b8a8_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;

   dst[0] = (float)src[0]; /* r */
   dst[1] = (float)src[1]; /* g */
   dst[2] = (float)src[2]; /* b */
   dst[3] = (float)src[3]; /* a */
}

/**
 * Unpack width R8G8B8A8_USCALED pixels (four bytes each) from src into
 * dst_row as 8-bit RGBA.  For every channel a zero byte yields 0 and any
 * non-zero byte yields 0xff (min(value, 1) scaled by 0xff).
 *
 * R, G, B and A occupy byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are read byte-by-byte.  This avoids a 32-bit load
 * through a uint8_t pointer that carries no alignment guarantee.
 */
void
util_format_r8g8b8a8_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      dst[0] = src[0] ? 0xff : 0; /* r */
      dst[1] = src[1] ? 0xff : 0; /* g */
      dst[2] = src[2] ? 0xff : 0; /* b */
      dst[3] = src[3] ? 0xff : 0; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into
 * R8G8B8A8_USCALED.  Every channel is stored as the integer quotient
 * value / 0xff, so the result is 1 for an input byte of 0xff and 0
 * otherwise.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows of the destination and source respectively.
 *
 * R, G, B and A go to byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are stored byte-by-byte.  This avoids a 32-bit
 * store through a uint8_t pointer that carries no alignment guarantee.
 */
void
util_format_r8g8b8a8_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)(src[0] / 0xff); /* r */
         dst[1] = (uint8_t)(src[1] / 0xff); /* g */
         dst[2] = (uint8_t)(src[2] / 0xff); /* b */
         dst[3] = (uint8_t)(src[3] / 0xff); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack width B8G8R8A8_USCALED pixels (four bytes each) from src into
 * dst_row as RGBA floats, each channel being the byte value converted to
 * float.
 *
 * B, G, R and A occupy byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are read byte-by-byte (with red and blue swapped
 * on output).  This avoids a 32-bit load through a uint8_t pointer that
 * carries no alignment guarantee.
 */
void
util_format_b8g8r8a8_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      dst[0] = (float)src[2]; /* r */
      dst[1] = (float)src[1]; /* g */
      dst[2] = (float)src[0]; /* b */
      dst[3] = (float)src[3]; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into
 * B8G8R8A8_USCALED.  Every channel is clamped to [0, 255] and converted to
 * an 8-bit integer.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows of the destination and source respectively.
 *
 * B, G, R and A go to byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are stored byte-by-byte.  This avoids a 32-bit
 * store through a uint8_t pointer that carries no alignment guarantee.
 */
void
util_format_b8g8r8a8_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)CLAMP(src[2], 0.0f, 255.0f); /* b */
         dst[1] = (uint8_t)CLAMP(src[1], 0.0f, 255.0f); /* g */
         dst[2] = (uint8_t)CLAMP(src[0], 0.0f, 255.0f); /* r */
         dst[3] = (uint8_t)CLAMP(src[3], 0.0f, 255.0f); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8A8_USCALED pixel from src and store it in in_dst as
 * four floats in R, G, B, A order, each being the byte value converted to
 * float.  The i and j arguments are not used.
 *
 * B, G, R and A occupy byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are read byte-by-byte.  This avoids a 32-bit load
 * through a uint8_t pointer that carries no alignment guarantee.
 */
void
util_format_b8g8r8a8_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;

   dst[0] = (float)src[2]; /* r */
   dst[1] = (float)src[1]; /* g */
   dst[2] = (float)src[0]; /* b */
   dst[3] = (float)src[3]; /* a */
}

/**
 * Unpack width B8G8R8A8_USCALED pixels (four bytes each) from src into
 * dst_row as 8-bit RGBA.  For every channel a zero byte yields 0 and any
 * non-zero byte yields 0xff (min(value, 1) scaled by 0xff).
 *
 * B, G, R and A occupy byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are read byte-by-byte.  This avoids a 32-bit load
 * through a uint8_t pointer that carries no alignment guarantee.
 */
void
util_format_b8g8r8a8_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      dst[0] = src[2] ? 0xff : 0; /* r */
      dst[1] = src[1] ? 0xff : 0; /* g */
      dst[2] = src[0] ? 0xff : 0; /* b */
      dst[3] = src[3] ? 0xff : 0; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into
 * B8G8R8A8_USCALED.  Every channel is stored as the integer quotient
 * value / 0xff, so the result is 1 for an input byte of 0xff and 0
 * otherwise.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows of the destination and source respectively.
 *
 * B, G, R and A go to byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are stored byte-by-byte.  This avoids a 32-bit
 * store through a uint8_t pointer that carries no alignment guarantee.
 */
void
util_format_b8g8r8a8_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)(src[2] / 0xff); /* b */
         dst[1] = (uint8_t)(src[1] / 0xff); /* g */
         dst[2] = (uint8_t)(src[0] / 0xff); /* r */
         dst[3] = (uint8_t)(src[3] / 0xff); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack width A8B8G8R8_USCALED pixels (four bytes each) from src into
 * dst_row as RGBA floats, each channel being the byte value converted to
 * float.
 *
 * A, B, G and R occupy byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are read byte-by-byte (in reverse order on
 * output).  This avoids a 32-bit load through a uint8_t pointer that
 * carries no alignment guarantee.
 */
void
util_format_a8b8g8r8_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      dst[0] = (float)src[3]; /* r */
      dst[1] = (float)src[2]; /* g */
      dst[2] = (float)src[1]; /* b */
      dst[3] = (float)src[0]; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into
 * A8B8G8R8_USCALED.  Every channel is clamped to [0, 255] and converted to
 * an 8-bit integer.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows of the destination and source respectively.
 *
 * A, B, G and R go to byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are stored byte-by-byte.  This avoids a 32-bit
 * store through a uint8_t pointer that carries no alignment guarantee.
 */
void
util_format_a8b8g8r8_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)CLAMP(src[3], 0.0f, 255.0f); /* a */
         dst[1] = (uint8_t)CLAMP(src[2], 0.0f, 255.0f); /* b */
         dst[2] = (uint8_t)CLAMP(src[1], 0.0f, 255.0f); /* g */
         dst[3] = (uint8_t)CLAMP(src[0], 0.0f, 255.0f); /* r */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A8B8G8R8_USCALED pixel from src and store it in in_dst as
 * four floats in R, G, B, A order, each being the byte value converted to
 * float.  The i and j arguments are not used.
 *
 * A, B, G and R occupy byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are read byte-by-byte.  This avoids a 32-bit load
 * through a uint8_t pointer that carries no alignment guarantee.
 */
void
util_format_a8b8g8r8_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;

   dst[0] = (float)src[3]; /* r */
   dst[1] = (float)src[2]; /* g */
   dst[2] = (float)src[1]; /* b */
   dst[3] = (float)src[0]; /* a */
}

/**
 * Unpack width A8B8G8R8_USCALED pixels (four bytes each) from src into
 * dst_row as 8-bit RGBA.  For every channel a zero byte yields 0 and any
 * non-zero byte yields 0xff (min(value, 1) scaled by 0xff).
 *
 * A, B, G and R occupy byte offsets 0..3 on little- and big-endian hosts
 * alike, so the channels are read byte-by-byte.  This avoids a 32-bit load
 * through a uint8_t pointer that carries no alignment guarantee.
 */
void
util_format_a8b8g8r8_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      dst[0] = src[3] ? 0xff : 0; /* r */
      dst[1] = src[2] ? 0xff : 0; /* g */
      dst[2] = src[1] ? 0xff : 0; /* b */
      dst[3] = src[0] ? 0xff : 0; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA8 UNORM texels into A8B8G8R8_USCALED.
 *
 * Each 8-bit source channel is integer-divided by 0xff, so only a source
 * value of 255 produces 1; every other value produces 0.
 *
 * \param dst_row     first destination row, 4 bytes (A, B, G, R) per texel
 * \param dst_stride  distance in bytes between destination rows
 * \param src_row     first source row, 4 bytes (R, G, B, A) per texel
 * \param src_stride  distance in bytes between source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_a8b8g8r8_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Store byte by byte: produces the same memory image as the
          * endian-specific 32-bit store, without needing dst aligned. */
         dst[0] = (uint8_t)(((uint32_t)src[3]) * 0x1 / 0xff); /* a */
         dst[1] = (uint8_t)(((uint32_t)src[2]) * 0x1 / 0xff); /* b */
         dst[2] = (uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff); /* g */
         dst[3] = (uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff); /* r */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R8_SNORM texels into RGBA floats.
 *
 * The single source byte is read as a signed 8-bit value and multiplied by
 * 1/127.  No clamping is applied, so -128 yields -128/127.  Green and blue
 * are written as 0 and alpha as 1.
 *
 * \param dst_row  destination row, four floats per texel
 * \param src      source row, one byte per texel
 * \param width    number of texels to convert
 */
void
util_format_r8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *rgba = dst_row;
   const uint8_t *const end = src + width;

   while (src != end) {
      const int8_t red = (int8_t)*src++;
      rgba[0] = (float)(red * (1.0f/0x7f)); /* r */
      rgba[1] = 0; /* g */
      rgba[2] = 0; /* b */
      rgba[3] = 1; /* a */
      rgba += 4;
   }
}

/**
 * Pack RGBA float texels into R8_SNORM.
 *
 * Only the red channel is used: it is clamped to [-1, 1] with CLAMP,
 * multiplied by 127, rounded with util_iround and stored as one byte.
 *
 * \param dst_row     first destination row, one byte per texel
 * \param dst_stride  distance in bytes between destination rows
 * \param src_row     first source row, four floats per texel
 * \param src_stride  distance in bytes between source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_r8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const float *rgba = src_row;
      uint8_t *texel = dst_row;
      for (unsigned col = 0; col < width; col += 1, rgba += 4, texel += 1) {
         const int8_t red = (int8_t)util_iround(CLAMP(rgba[0], -1.0f, 1.0f) * 0x7f);
         *texel = (uint8_t)red;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8_SNORM texel as four floats.
 *
 * Red is the signed source byte multiplied by 1/127; green and blue are
 * written as 0 and alpha as 1.
 *
 * \param in_dst  destination, written as four floats in R, G, B, A order
 * \param src     pointer to the 1-byte source texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_r8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const int8_t red = (int8_t)src[0];

   rgba[0] = (float)(red * (1.0f/0x7f)); /* r */
   rgba[1] = 0; /* g */
   rgba[2] = 0; /* b */
   rgba[3] = 1; /* a */
}

/**
 * Unpack a row of R8_SNORM texels into RGBA8 UNORM.
 *
 * The signed source byte goes through MAX2(x, 0) and is then converted by
 * _mesa_snorm_to_unorm(x, 8, 8).  Green and blue are written as 0 and
 * alpha as 255.
 *
 * \param dst_row  destination row, 4 bytes per texel
 * \param src      source row, one byte per texel
 * \param width    number of texels to convert
 */
void
util_format_r8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *rgba = dst_row;
   for (unsigned col = 0; col < width; col += 1, src += 1, rgba += 4) {
      const int8_t red = (int8_t)src[0];
      rgba[0] = _mesa_snorm_to_unorm(MAX2(red, 0), 8, 8); /* r */
      rgba[1] = 0; /* g */
      rgba[2] = 0; /* b */
      rgba[3] = 255; /* a */
   }
}

/**
 * Pack RGBA8 UNORM texels into R8_SNORM.
 *
 * Only the red byte is used; it is converted with
 * _mesa_unorm_to_snorm(x, 8, 8) and its low 8 bits are stored.
 *
 * \param dst_row     first destination row, one byte per texel
 * \param dst_stride  distance in bytes between destination rows
 * \param src_row     first source row, 4 bytes per texel
 * \param src_stride  distance in bytes between source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_r8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const uint8_t *rgba = src_row;
      uint8_t *texel = dst_row;
      for (unsigned col = 0; col < width; col += 1, rgba += 4, texel += 1) {
         *texel = (uint8_t)(_mesa_unorm_to_snorm(rgba[0], 8, 8));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R8G8_SNORM texels into RGBA floats.
 *
 * Each texel is two signed bytes, R then G in memory.  Both are multiplied
 * by 1/127 without clamping (so -128 yields -128/127).  Blue is written as
 * 0 and alpha as 1.
 *
 * \param dst_row  destination row, four floats per texel
 * \param src      source row, 2 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_r8g8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Byte-wise reads match the endian-specific 16-bit load + shifts and
       * also work when src sits on an odd address. */
      const int8_t r = (int8_t)src[0];
      const int8_t g = (int8_t)src[1];
      dst[0] = (float)(r * (1.0f/0x7f)); /* r */
      dst[1] = (float)(g * (1.0f/0x7f)); /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack RGBA float texels into R8G8_SNORM.
 *
 * Red and green are each clamped to [-1, 1] with CLAMP, multiplied by 127,
 * rounded with util_iround and stored as one byte; blue and alpha are
 * ignored.
 *
 * \param dst_row     first destination row, 2 bytes (R, G) per texel
 * \param dst_stride  distance in bytes between destination rows
 * \param src_row     first source row, four floats per texel
 * \param src_stride  distance in bytes between source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_r8g8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Byte-wise stores give the same memory image as the
          * endian-specific 16-bit store and tolerate odd dst addresses. */
         dst[0] = (uint8_t)(int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f); /* r */
         dst[1] = (uint8_t)(int8_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7f); /* g */
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8_SNORM texel as four floats.
 *
 * Red and green are the two signed source bytes (R first in memory), each
 * multiplied by 1/127; blue is written as 0 and alpha as 1.
 *
 * \param in_dst  destination, written as four floats in R, G, B, A order
 * \param src     pointer to the 2-byte source texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_r8g8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;

   /* Byte-wise reads: same channels as the endian-specific 16-bit load,
    * with no alignment requirement on src. */
   const int8_t r = (int8_t)src[0];
   const int8_t g = (int8_t)src[1];

   dst[0] = (float)(r * (1.0f/0x7f)); /* r */
   dst[1] = (float)(g * (1.0f/0x7f)); /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a row of R8G8_SNORM texels into RGBA8 UNORM.
 *
 * Red and green (signed bytes, R first in memory) go through MAX2(x, 0)
 * and _mesa_snorm_to_unorm(x, 8, 8).  Blue is written as 0 and alpha as
 * 255.
 *
 * \param dst_row  destination row, 4 bytes per texel
 * \param src      source row, 2 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_r8g8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Byte-wise reads replace the endian-specific 16-bit load so that
       * src may sit on an odd address. */
      const int16_t r = (int8_t)src[0];
      const int16_t g = (int8_t)src[1];
      dst[0] = _mesa_snorm_to_unorm(MAX2(r, 0), 8, 8); /* r */
      dst[1] = _mesa_snorm_to_unorm(MAX2(g, 0), 8, 8); /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack RGBA8 UNORM texels into R8G8_SNORM.
 *
 * Red and green are converted with _mesa_unorm_to_snorm(x, 8, 8) and the
 * low 8 bits of each result are stored; blue and alpha are ignored.
 *
 * \param dst_row     first destination row, 2 bytes (R, G) per texel
 * \param dst_stride  distance in bytes between destination rows
 * \param src_row     first source row, 4 bytes per texel
 * \param src_stride  distance in bytes between source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_r8g8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Byte-wise stores: same memory image as the endian-specific
          * 16-bit store, without requiring dst to be 2-byte aligned. */
         dst[0] = (uint8_t)(_mesa_unorm_to_snorm(src[0], 8, 8)); /* r */
         dst[1] = (uint8_t)(_mesa_unorm_to_snorm(src[1], 8, 8)); /* g */
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One R8G8B8_SNORM texel: three signed bytes declared in the order
 * r, g, b.  The member order does not depend on UTIL_ARCH_BIG_ENDIAN.
 */
struct util_format_r8g8b8_snorm {
   int8_t r;
   int8_t g;
   int8_t b;
};

void
util_format_r8g8b8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0f/0x7f)); /* r */
         dst[1] = (float)(pixel.g * (1.0f/0x7f)); /* g */
         dst[2] = (float)(pixel.b * (1.0f/0x7f)); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0f/0x7f)); /* r */
         dst[1] = (float)(pixel.g * (1.0f/0x7f)); /* g */
         dst[2] = (float)(pixel.b * (1.0f/0x7f)); /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_r8g8b8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_snorm pixel = {0};
         pixel.r = (int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f);
         pixel.g = (int8_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7f);
         pixel.b = (int8_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_snorm pixel = {0};
         pixel.r = (int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f);
         pixel.g = (int8_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7f);
         pixel.b = (int8_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8B8_SNORM texel as four floats.
 *
 * The three signed channels are multiplied by 1/127; alpha is written
 * as 1.
 *
 * \param in_dst  destination, written as four floats in R, G, B, A order
 * \param src     pointer to the source texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_r8g8b8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r8g8b8_snorm texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)(texel.r * (1.0f/0x7f)); /* r */
   rgba[1] = (float)(texel.g * (1.0f/0x7f)); /* g */
   rgba[2] = (float)(texel.b * (1.0f/0x7f)); /* b */
   rgba[3] = 1; /* a */
}

void
util_format_r8g8b8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 8, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 8, 8); /* g */
         dst[2] = _mesa_snorm_to_unorm(MAX2(pixel.b, 0), 8, 8); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r8g8b8_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 8, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 8, 8); /* g */
         dst[2] = _mesa_snorm_to_unorm(MAX2(pixel.b, 0), 8, 8); /* b */
         dst[3] = 255; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_r8g8b8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_snorm pixel = {0};
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 8);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 8);
         pixel.b = _mesa_unorm_to_snorm(src[2], 8, 8);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_snorm pixel = {0};
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 8);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 8);
         pixel.b = _mesa_unorm_to_snorm(src[2], 8, 8);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One B8G8R8_SNORM texel: three signed bytes declared in the order
 * b, g, r.  The member order does not depend on UTIL_ARCH_BIG_ENDIAN.
 */
struct util_format_b8g8r8_snorm {
   int8_t b;
   int8_t g;
   int8_t r;
};

void
util_format_b8g8r8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0f/0x7f)); /* r */
         dst[1] = (float)(pixel.g * (1.0f/0x7f)); /* g */
         dst[2] = (float)(pixel.b * (1.0f/0x7f)); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0f/0x7f)); /* r */
         dst[1] = (float)(pixel.g * (1.0f/0x7f)); /* g */
         dst[2] = (float)(pixel.b * (1.0f/0x7f)); /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_b8g8r8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_snorm pixel = {0};
         pixel.b = (int8_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7f);
         pixel.g = (int8_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7f);
         pixel.r = (int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_snorm pixel = {0};
         pixel.b = (int8_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7f);
         pixel.g = (int8_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7f);
         pixel.r = (int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one B8G8R8_SNORM texel as four floats.
 *
 * The three signed channels are multiplied by 1/127 and written in
 * R, G, B order; alpha is written as 1.
 *
 * \param in_dst  destination, written as four floats in R, G, B, A order
 * \param src     pointer to the source texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_b8g8r8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_b8g8r8_snorm texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)(texel.r * (1.0f/0x7f)); /* r */
   rgba[1] = (float)(texel.g * (1.0f/0x7f)); /* g */
   rgba[2] = (float)(texel.b * (1.0f/0x7f)); /* b */
   rgba[3] = 1; /* a */
}

void
util_format_b8g8r8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 8, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 8, 8); /* g */
         dst[2] = _mesa_snorm_to_unorm(MAX2(pixel.b, 0), 8, 8); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_b8g8r8_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = _mesa_snorm_to_unorm(MAX2(pixel.r, 0), 8, 8); /* r */
         dst[1] = _mesa_snorm_to_unorm(MAX2(pixel.g, 0), 8, 8); /* g */
         dst[2] = _mesa_snorm_to_unorm(MAX2(pixel.b, 0), 8, 8); /* b */
         dst[3] = 255; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_b8g8r8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_snorm pixel = {0};
         pixel.b = _mesa_unorm_to_snorm(src[2], 8, 8);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 8);
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 8);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_snorm pixel = {0};
         pixel.b = _mesa_unorm_to_snorm(src[2], 8, 8);
         pixel.g = _mesa_unorm_to_snorm(src[1], 8, 8);
         pixel.r = _mesa_unorm_to_snorm(src[0], 8, 8);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R8G8B8A8_SNORM texels into RGBA floats.
 *
 * Each texel is four signed bytes, R, G, B, A at increasing addresses.
 * Every channel is multiplied by 1/127 without clamping (so -128 yields
 * -128/127).
 *
 * \param dst_row  destination row, four floats per texel
 * \param src      source row, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_r8g8b8a8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Byte-wise reads give the same channels as the endian-specific
       * 32-bit load + shifts and do not need src to be 4-byte aligned. */
      const int32_t r = (int8_t)src[0];
      const int32_t g = (int8_t)src[1];
      const int32_t b = (int8_t)src[2];
      const int32_t a = (int8_t)src[3];
      dst[0] = (float)(r * (1.0f/0x7f)); /* r */
      dst[1] = (float)(g * (1.0f/0x7f)); /* g */
      dst[2] = (float)(b * (1.0f/0x7f)); /* b */
      dst[3] = (float)(a * (1.0f/0x7f)); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA float texels into R8G8B8A8_SNORM.
 *
 * Every channel is clamped to [-1, 1] with CLAMP, multiplied by 127,
 * rounded with util_iround and stored as one byte, R, G, B, A at
 * increasing addresses.
 *
 * \param dst_row     first destination row, 4 bytes per texel
 * \param dst_stride  distance in bytes between destination rows
 * \param src_row     first source row, four floats per texel
 * \param src_stride  distance in bytes between source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_r8g8b8a8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Byte-wise stores give the same memory image as the
          * endian-specific 32-bit store, with no alignment requirement. */
         dst[0] = (uint8_t)(int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f); /* r */
         dst[1] = (uint8_t)(int8_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7f); /* g */
         dst[2] = (uint8_t)(int8_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7f); /* b */
         dst[3] = (uint8_t)(int8_t)util_iround(CLAMP(src[3], -1.0f, 1.0f) * 0x7f); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8B8A8_SNORM texel as four floats.
 *
 * The four signed source bytes (R, G, B, A at increasing addresses) are
 * each multiplied by 1/127.
 *
 * \param in_dst  destination, written as four floats in R, G, B, A order
 * \param src     pointer to the 4-byte source texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_r8g8b8a8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;

   /* Byte-wise reads: same channels as the endian-specific 32-bit load,
    * without requiring src to be 4-byte aligned. */
   const int32_t r = (int8_t)src[0];
   const int32_t g = (int8_t)src[1];
   const int32_t b = (int8_t)src[2];
   const int32_t a = (int8_t)src[3];

   dst[0] = (float)(r * (1.0f/0x7f)); /* r */
   dst[1] = (float)(g * (1.0f/0x7f)); /* g */
   dst[2] = (float)(b * (1.0f/0x7f)); /* b */
   dst[3] = (float)(a * (1.0f/0x7f)); /* a */
}

/**
 * Unpack a row of R8G8B8A8_SNORM texels into RGBA8 UNORM.
 *
 * Every signed channel goes through MAX2(x, 0) and
 * _mesa_snorm_to_unorm(x, 8, 8).
 *
 * \param dst_row  destination row, 4 bytes per texel
 * \param src      source row, 4 bytes (R, G, B, A) per texel
 * \param width    number of texels to convert
 */
void
util_format_r8g8b8a8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Byte-wise reads replace the endian-specific 32-bit load so that
       * src need not be 4-byte aligned. */
      const int32_t r = (int8_t)src[0];
      const int32_t g = (int8_t)src[1];
      const int32_t b = (int8_t)src[2];
      const int32_t a = (int8_t)src[3];
      dst[0] = _mesa_snorm_to_unorm(MAX2(r, 0), 8, 8); /* r */
      dst[1] = _mesa_snorm_to_unorm(MAX2(g, 0), 8, 8); /* g */
      dst[2] = _mesa_snorm_to_unorm(MAX2(b, 0), 8, 8); /* b */
      dst[3] = _mesa_snorm_to_unorm(MAX2(a, 0), 8, 8); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA8 UNORM texels into R8G8B8A8_SNORM.
 *
 * Every channel is converted with _mesa_unorm_to_snorm(x, 8, 8) and the
 * low 8 bits of the result are stored, R, G, B, A at increasing addresses.
 *
 * \param dst_row     first destination row, 4 bytes per texel
 * \param dst_stride  distance in bytes between destination rows
 * \param src_row     first source row, 4 bytes per texel
 * \param src_stride  distance in bytes between source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_r8g8b8a8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Byte-wise stores give the same memory image as the
          * endian-specific 32-bit store, with no alignment requirement. */
         dst[0] = (uint8_t)(_mesa_unorm_to_snorm(src[0], 8, 8)); /* r */
         dst[1] = (uint8_t)(_mesa_unorm_to_snorm(src[1], 8, 8)); /* g */
         dst[2] = (uint8_t)(_mesa_unorm_to_snorm(src[2], 8, 8)); /* b */
         dst[3] = (uint8_t)(_mesa_unorm_to_snorm(src[3], 8, 8)); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of B8G8R8A8_SNORM texels into RGBA floats.
 *
 * Each texel is four signed bytes, B, G, R, A at increasing addresses.
 * Every channel is multiplied by 1/127 without clamping and written in
 * R, G, B, A order.
 *
 * \param dst_row  destination row, four floats per texel
 * \param src      source row, 4 bytes per texel
 * \param width    number of texels to convert
 */
void
util_format_b8g8r8a8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Byte-wise reads give the same channels as the endian-specific
       * 32-bit load + shifts and do not need src to be 4-byte aligned. */
      const int32_t b = (int8_t)src[0];
      const int32_t g = (int8_t)src[1];
      const int32_t r = (int8_t)src[2];
      const int32_t a = (int8_t)src[3];
      dst[0] = (float)(r * (1.0f/0x7f)); /* r */
      dst[1] = (float)(g * (1.0f/0x7f)); /* g */
      dst[2] = (float)(b * (1.0f/0x7f)); /* b */
      dst[3] = (float)(a * (1.0f/0x7f)); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA float texels into B8G8R8A8_SNORM.
 *
 * Every channel is clamped to [-1, 1] with CLAMP, multiplied by 127,
 * rounded with util_iround and stored as one byte, B, G, R, A at
 * increasing addresses.
 *
 * \param dst_row     first destination row, 4 bytes per texel
 * \param dst_stride  distance in bytes between destination rows
 * \param src_row     first source row, four floats (R, G, B, A) per texel
 * \param src_stride  distance in bytes between source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_b8g8r8a8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Byte-wise stores give the same memory image as the
          * endian-specific 32-bit store, with no alignment requirement. */
         dst[0] = (uint8_t)(int8_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7f); /* b */
         dst[1] = (uint8_t)(int8_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7f); /* g */
         dst[2] = (uint8_t)(int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f); /* r */
         dst[3] = (uint8_t)(int8_t)util_iround(CLAMP(src[3], -1.0f, 1.0f) * 0x7f); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one B8G8R8A8_SNORM texel as four floats.
 *
 * The four signed source bytes (B, G, R, A at increasing addresses) are
 * each multiplied by 1/127 and written in R, G, B, A order.
 *
 * \param in_dst  destination, written as four floats in R, G, B, A order
 * \param src     pointer to the 4-byte source texel
 * \param i       unused
 * \param j       unused
 */
void
util_format_b8g8r8a8_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;

   /* Byte-wise reads: same channels as the endian-specific 32-bit load,
    * without requiring src to be 4-byte aligned. */
   const int32_t b = (int8_t)src[0];
   const int32_t g = (int8_t)src[1];
   const int32_t r = (int8_t)src[2];
   const int32_t a = (int8_t)src[3];

   dst[0] = (float)(r * (1.0f/0x7f)); /* r */
   dst[1] = (float)(g * (1.0f/0x7f)); /* g */
   dst[2] = (float)(b * (1.0f/0x7f)); /* b */
   dst[3] = (float)(a * (1.0f/0x7f)); /* a */
}

/**
 * Unpack a row of B8G8R8A8_SNORM texels into RGBA8 UNORM.
 *
 * Every signed channel goes through MAX2(x, 0) and
 * _mesa_snorm_to_unorm(x, 8, 8); output order is R, G, B, A.
 *
 * \param dst_row  destination row, 4 bytes per texel
 * \param src      source row, 4 bytes (B, G, R, A) per texel
 * \param width    number of texels to convert
 */
void
util_format_b8g8r8a8_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Byte-wise reads replace the endian-specific 32-bit load so that
       * src need not be 4-byte aligned. */
      const int32_t b = (int8_t)src[0];
      const int32_t g = (int8_t)src[1];
      const int32_t r = (int8_t)src[2];
      const int32_t a = (int8_t)src[3];
      dst[0] = _mesa_snorm_to_unorm(MAX2(r, 0), 8, 8); /* r */
      dst[1] = _mesa_snorm_to_unorm(MAX2(g, 0), 8, 8); /* g */
      dst[2] = _mesa_snorm_to_unorm(MAX2(b, 0), 8, 8); /* b */
      dst[3] = _mesa_snorm_to_unorm(MAX2(a, 0), 8, 8); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA8 UNORM texels into B8G8R8A8_SNORM.
 *
 * Every channel is converted with _mesa_unorm_to_snorm(x, 8, 8) and the
 * low 8 bits of the result are stored, B, G, R, A at increasing addresses.
 *
 * \param dst_row     first destination row, 4 bytes per texel
 * \param dst_stride  distance in bytes between destination rows
 * \param src_row     first source row, 4 bytes (R, G, B, A) per texel
 * \param src_stride  distance in bytes between source rows
 * \param width       texels per row
 * \param height      number of rows
 */
void
util_format_b8g8r8a8_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Byte-wise stores give the same memory image as the
          * endian-specific 32-bit store, with no alignment requirement. */
         dst[0] = (uint8_t)(_mesa_unorm_to_snorm(src[2], 8, 8)); /* b */
         dst[1] = (uint8_t)(_mesa_unorm_to_snorm(src[1], 8, 8)); /* g */
         dst[2] = (uint8_t)(_mesa_unorm_to_snorm(src[0], 8, 8)); /* r */
         dst[3] = (uint8_t)(_mesa_unorm_to_snorm(src[3], 8, 8)); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R8_SSCALED pixels to RGBA floats.
 *
 * Each source byte is read as a signed 8-bit value and converted to float
 * unchanged; green and blue are written as 0 and alpha as 1.
 *
 * dst_row receives 4 floats per pixel, src supplies 1 byte per pixel.
 */
void
util_format_r8_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;
   for (unsigned left = width; left > 0; left -= 1) {
      const int8_t red = (int8_t)src[0];
      out[0] = (float)red; /* r */
      out[1] = 0; /* g */
      out[2] = 0; /* b */
      out[3] = 1; /* a */
      src += 1;
      out += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into R8_SSCALED.
 *
 * Only the red channel is used: it is clamped to [-128, 127], converted to a
 * signed 8-bit integer and stored as one byte per pixel.
 *
 * dst_stride and src_stride are byte strides; width and height count pixels.
 */
void
util_format_r8_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const int8_t red = (int8_t)CLAMP(in[0], -128.0f, 127.0f);
         out[0] = (uint8_t)red;
         in += 4;
         out += 1;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8_SSCALED pixel as RGBA floats.
 *
 * src points at the pixel's byte; i and j are not used.  Red is the signed
 * byte converted to float, green and blue are 0, alpha is 1.
 */
void
util_format_r8_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const int8_t red = (int8_t)src[0];
   rgba[0] = (float)red; /* r */
   rgba[1] = 0; /* g */
   rgba[2] = 0; /* b */
   rgba[3] = 1; /* a */
}

/**
 * Unpack a run of R8_SSCALED pixels to RGBA8 UNORM.
 *
 * The signed red byte is clamped to [0, 1] and scaled by 0xff, so the output
 * red is either 0 or 255.  Green and blue are 0, alpha is 255.
 */
void
util_format_r8_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;
   for (unsigned left = width; left > 0; left -= 1) {
      const int8_t red = (int8_t)src[0];
      const uint32_t red01 = (uint32_t)CLAMP(red, 0, 1);
      out[0] = (uint8_t)(red01 * 0xff); /* r */
      out[1] = 0; /* g */
      out[2] = 0; /* b */
      out[3] = 255; /* a */
      src += 1;
      out += 4;
   }
}

/**
 * Pack rows of RGBA8 UNORM pixels into R8_SSCALED.
 *
 * Only red is used.  It is integer-divided by 0xff, so 255 becomes 1 and
 * every other input becomes 0.
 *
 * dst_stride and src_stride are byte strides; width and height count pixels.
 */
void
util_format_r8_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const int8_t red = (int8_t)((uint32_t)in[0] / 0xff);
         out[0] = (uint8_t)red;
         in += 4;
         out += 1;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R8G8_SSCALED pixels to RGBA floats.
 *
 * The first byte of each pixel is R and the second is G on either host byte
 * order, so the bytes are read individually.  This avoids a 16-bit load
 * through a uint8_t pointer (which needs an aligned src) and the per-endian
 * shift sequences.  Each byte is taken as a signed 8-bit value and converted
 * to float unchanged; blue is 0 and alpha is 1.
 */
void
util_format_r8g8_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const int16_t r = (int8_t)src[0];
      const int16_t g = (int8_t)src[1];
      dst[0] = (float)r; /* r */
      dst[1] = (float)g; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into R8G8_SSCALED.
 *
 * Red and green are clamped to [-128, 127], converted to signed 8-bit
 * integers and stored as bytes R then G.  Writing the bytes individually
 * gives the same memory layout on either host byte order and does not need
 * dst to be aligned for a 16-bit store.
 *
 * dst_stride and src_stride are byte strides; width and height count pixels.
 */
void
util_format_r8g8_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const int8_t r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         const int8_t g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         dst[0] = (uint8_t)r; /* r */
         dst[1] = (uint8_t)g; /* g */
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8_SSCALED pixel as RGBA floats.
 *
 * src points at the pixel; i and j are not used.  R is the first byte and G
 * the second on either host byte order, so they are read as individual
 * signed bytes rather than through a 16-bit load that would need an aligned
 * src.  Blue is 0 and alpha is 1.
 */
void
util_format_r8g8_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const int16_t r = (int8_t)src[0];
   const int16_t g = (int8_t)src[1];
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a run of R8G8_SSCALED pixels to RGBA8 UNORM.
 *
 * R and G are read as individual signed bytes (first and second byte of the
 * pixel on either host byte order), so src needs no 16-bit alignment.  Each
 * is clamped to [0, 1] and scaled by 0xff, giving 0 or 255.  Blue is 0 and
 * alpha is 255.
 */
void
util_format_r8g8_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const int16_t r = (int8_t)src[0];
      const int16_t g = (int8_t)src[1];
      dst[0] = (uint8_t)(((uint32_t)CLAMP(r, 0, 1)) * 0xff / 0x1); /* r */
      dst[1] = (uint8_t)(((uint32_t)CLAMP(g, 0, 1)) * 0xff / 0x1); /* g */
      dst[2] = 0; /* b */
      dst[3] = 255; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA8 UNORM pixels into R8G8_SSCALED.
 *
 * Red and green are integer-divided by 0xff (255 becomes 1, anything else 0)
 * and stored as bytes R then G.  Writing the bytes individually gives the
 * same memory layout on either host byte order and does not need dst to be
 * aligned for a 16-bit store.
 *
 * dst_stride and src_stride are byte strides; width and height count pixels.
 */
void
util_format_r8g8_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const int8_t r = (int8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         const int8_t g = (int8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         dst[0] = (uint8_t)r; /* r */
         dst[1] = (uint8_t)g; /* g */
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R8G8B8_SSCALED pixel: three signed bytes in the
 * order r, g, b.  The member order is the same on big- and little-endian
 * hosts.
 */
struct util_format_r8g8b8_sscaled {
   int8_t r;
   int8_t g;
   int8_t b;
};

void
util_format_r8g8b8_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_r8g8b8_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sscaled pixel = {0};
         pixel.r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         pixel.g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         pixel.b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_sscaled pixel = {0};
         pixel.r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         pixel.g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         pixel.b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8_SSCALED pixel as RGBA floats.
 *
 * src points at the 3-byte pixel; i and j are not used.  The signed channels
 * are converted to float unchanged and alpha is 1.
 */
void
util_format_r8g8b8_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r8g8b8_sscaled texel;
   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)texel.r; /* r */
   rgba[1] = (float)texel.g; /* g */
   rgba[2] = (float)texel.b; /* b */
   rgba[3] = 1; /* a */
}

void
util_format_r8g8b8_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r8g8b8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_r8g8b8_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sscaled pixel = {0};
         pixel.r = (int8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_sscaled pixel = {0};
         pixel.r = (int8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one B8G8R8_SSCALED pixel: three signed bytes in the
 * order b, g, r.  The member order is the same on big- and little-endian
 * hosts.
 */
struct util_format_b8g8r8_sscaled {
   int8_t b;
   int8_t g;
   int8_t r;
};

void
util_format_b8g8r8_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_b8g8r8_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sscaled pixel = {0};
         pixel.b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         pixel.g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         pixel.r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_sscaled pixel = {0};
         pixel.b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         pixel.g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         pixel.r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8_SSCALED pixel as RGBA floats.
 *
 * src points at the 3-byte pixel; i and j are not used.  The signed channels
 * are converted to float unchanged, written in r, g, b order; alpha is 1.
 */
void
util_format_b8g8r8_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_b8g8r8_sscaled texel;
   memcpy(&texel, src, sizeof texel);
   rgba[0] = (float)texel.r; /* r */
   rgba[1] = (float)texel.g; /* g */
   rgba[2] = (float)texel.b; /* b */
   rgba[3] = 1; /* a */
}

void
util_format_b8g8r8_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_b8g8r8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_b8g8r8_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sscaled pixel = {0};
         pixel.b = (int8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.g = (int8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.r = (int8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_sscaled pixel = {0};
         pixel.b = (int8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.g = (int8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.r = (int8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R8G8B8A8_SSCALED pixels to RGBA floats.
 *
 * The pixel's bytes are R, G, B, A in memory on either host byte order, so
 * they are read individually.  This avoids a 32-bit load through a uint8_t
 * pointer (which needs an aligned src) and the per-endian shift sequences.
 * Each byte is taken as a signed 8-bit value and converted to float
 * unchanged.
 */
void
util_format_r8g8b8a8_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const int32_t r = (int8_t)src[0];
      const int32_t g = (int8_t)src[1];
      const int32_t b = (int8_t)src[2];
      const int32_t a = (int8_t)src[3];
      dst[0] = (float)r; /* r */
      dst[1] = (float)g; /* g */
      dst[2] = (float)b; /* b */
      dst[3] = (float)a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into R8G8B8A8_SSCALED.
 *
 * Every channel is clamped to [-128, 127], converted to a signed 8-bit
 * integer and stored in memory order R, G, B, A.  Writing the bytes
 * individually gives the same layout on either host byte order and does not
 * need dst to be aligned for a 32-bit store.
 *
 * dst_stride and src_stride are byte strides; width and height count pixels.
 */
void
util_format_r8g8b8a8_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const int8_t r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         const int8_t g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         const int8_t b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         const int8_t a = (int8_t)CLAMP(src[3], -128.0f, 127.0f);
         dst[0] = (uint8_t)r; /* r */
         dst[1] = (uint8_t)g; /* g */
         dst[2] = (uint8_t)b; /* b */
         dst[3] = (uint8_t)a; /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8A8_SSCALED pixel as RGBA floats.
 *
 * src points at the pixel; i and j are not used.  The bytes are R, G, B, A
 * in memory on either host byte order, so they are read as individual
 * signed bytes rather than through a 32-bit load that would need an aligned
 * src.
 */
void
util_format_r8g8b8a8_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const int32_t r = (int8_t)src[0];
   const int32_t g = (int8_t)src[1];
   const int32_t b = (int8_t)src[2];
   const int32_t a = (int8_t)src[3];
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = (float)b; /* b */
   dst[3] = (float)a; /* a */
}

/**
 * Unpack a run of R8G8B8A8_SSCALED pixels to RGBA8 UNORM.
 *
 * The bytes are R, G, B, A in memory on either host byte order and are read
 * individually as signed values, so src needs no 32-bit alignment.  Each
 * channel is clamped to [0, 1] and scaled by 0xff, giving 0 or 255.
 */
void
util_format_r8g8b8a8_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Read the whole pixel before writing any output byte. */
      const int32_t r = (int8_t)src[0];
      const int32_t g = (int8_t)src[1];
      const int32_t b = (int8_t)src[2];
      const int32_t a = (int8_t)src[3];
      dst[0] = (uint8_t)(((uint32_t)CLAMP(r, 0, 1)) * 0xff / 0x1); /* r */
      dst[1] = (uint8_t)(((uint32_t)CLAMP(g, 0, 1)) * 0xff / 0x1); /* g */
      dst[2] = (uint8_t)(((uint32_t)CLAMP(b, 0, 1)) * 0xff / 0x1); /* b */
      dst[3] = (uint8_t)(((uint32_t)CLAMP(a, 0, 1)) * 0xff / 0x1); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA8 UNORM pixels into R8G8B8A8_SSCALED.
 *
 * Every channel is integer-divided by 0xff (255 becomes 1, anything else 0)
 * and stored in memory order R, G, B, A.  Writing the bytes individually
 * gives the same layout on either host byte order and does not need dst to
 * be aligned for a 32-bit store.
 *
 * dst_stride and src_stride are byte strides; width and height count pixels.
 */
void
util_format_r8g8b8a8_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Convert the whole pixel before writing any output byte. */
         const int8_t r = (int8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         const int8_t g = (int8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         const int8_t b = (int8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         const int8_t a = (int8_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         dst[0] = (uint8_t)r; /* r */
         dst[1] = (uint8_t)g; /* g */
         dst[2] = (uint8_t)b; /* b */
         dst[3] = (uint8_t)a; /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of B8G8R8A8_SSCALED pixels to RGBA floats.
 *
 * The pixel's bytes are B, G, R, A in memory on either host byte order, so
 * they are read individually.  This avoids a 32-bit load through a uint8_t
 * pointer (which needs an aligned src) and the per-endian shift sequences.
 * Each byte is taken as a signed 8-bit value and converted to float
 * unchanged; the output order is r, g, b, a.
 */
void
util_format_b8g8r8a8_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const int32_t b = (int8_t)src[0];
      const int32_t g = (int8_t)src[1];
      const int32_t r = (int8_t)src[2];
      const int32_t a = (int8_t)src[3];
      dst[0] = (float)r; /* r */
      dst[1] = (float)g; /* g */
      dst[2] = (float)b; /* b */
      dst[3] = (float)a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into B8G8R8A8_SSCALED.
 *
 * Every channel is clamped to [-128, 127], converted to a signed 8-bit
 * integer and stored in memory order B, G, R, A.  Writing the bytes
 * individually gives the same layout on either host byte order and does not
 * need dst to be aligned for a 32-bit store.
 *
 * dst_stride and src_stride are byte strides; width and height count pixels.
 */
void
util_format_b8g8r8a8_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const int8_t b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         const int8_t g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         const int8_t r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         const int8_t a = (int8_t)CLAMP(src[3], -128.0f, 127.0f);
         dst[0] = (uint8_t)b; /* b */
         dst[1] = (uint8_t)g; /* g */
         dst[2] = (uint8_t)r; /* r */
         dst[3] = (uint8_t)a; /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8A8_SSCALED pixel as RGBA floats.
 *
 * src points at the pixel; i and j are not used.  The bytes are B, G, R, A
 * in memory on either host byte order, so they are read as individual
 * signed bytes rather than through a 32-bit load that would need an aligned
 * src.  The output order is r, g, b, a.
 */
void
util_format_b8g8r8a8_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const int32_t b = (int8_t)src[0];
   const int32_t g = (int8_t)src[1];
   const int32_t r = (int8_t)src[2];
   const int32_t a = (int8_t)src[3];
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = (float)b; /* b */
   dst[3] = (float)a; /* a */
}

/**
 * Unpack a run of B8G8R8A8_SSCALED pixels to RGBA8 UNORM.
 *
 * The bytes are B, G, R, A in memory on either host byte order and are read
 * individually as signed values, so src needs no 32-bit alignment.  Each
 * channel is clamped to [0, 1] and scaled by 0xff, giving 0 or 255; the
 * output order is r, g, b, a.
 */
void
util_format_b8g8r8a8_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* Read the whole pixel before writing any output byte. */
      const int32_t b = (int8_t)src[0];
      const int32_t g = (int8_t)src[1];
      const int32_t r = (int8_t)src[2];
      const int32_t a = (int8_t)src[3];
      dst[0] = (uint8_t)(((uint32_t)CLAMP(r, 0, 1)) * 0xff / 0x1); /* r */
      dst[1] = (uint8_t)(((uint32_t)CLAMP(g, 0, 1)) * 0xff / 0x1); /* g */
      dst[2] = (uint8_t)(((uint32_t)CLAMP(b, 0, 1)) * 0xff / 0x1); /* b */
      dst[3] = (uint8_t)(((uint32_t)CLAMP(a, 0, 1)) * 0xff / 0x1); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA8 UNORM pixels into B8G8R8A8_SSCALED.
 *
 * Every channel is integer-divided by 0xff (255 becomes 1, anything else 0)
 * and stored in memory order B, G, R, A.  Writing the bytes individually
 * gives the same layout on either host byte order and does not need dst to
 * be aligned for a 32-bit store.
 *
 * dst_stride and src_stride are byte strides; width and height count pixels.
 */
void
util_format_b8g8r8a8_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         /* Convert the whole pixel before writing any output byte. */
         const int8_t b = (int8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         const int8_t g = (int8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         const int8_t r = (int8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         const int8_t a = (int8_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         dst[0] = (uint8_t)b; /* b */
         dst[1] = (uint8_t)g; /* g */
         dst[2] = (uint8_t)r; /* r */
         dst[3] = (uint8_t)a; /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of A8B8G8R8_SSCALED pixels to RGBA floats.
 *
 * The pixel's bytes are A, B, G, R in memory on either host byte order, so
 * they are read individually.  This avoids a 32-bit load through a uint8_t
 * pointer (which needs an aligned src) and the per-endian shift sequences.
 * Each byte is taken as a signed 8-bit value and converted to float
 * unchanged; the output order is r, g, b, a.
 */
void
util_format_a8b8g8r8_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const int32_t a = (int8_t)src[0];
      const int32_t b = (int8_t)src[1];
      const int32_t g = (int8_t)src[2];
      const int32_t r = (int8_t)src[3];
      dst[0] = (float)r; /* r */
      dst[1] = (float)g; /* g */
      dst[2] = (float)b; /* b */
      dst[3] = (float)a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into A8B8G8R8_SSCALED.
 *
 * Every channel is clamped to [-128, 127], converted to a signed 8-bit
 * integer and stored in memory order A, B, G, R.  Writing the bytes
 * individually gives the same layout on either host byte order and does not
 * need dst to be aligned for a 32-bit store.
 *
 * dst_stride and src_stride are byte strides; width and height count pixels.
 */
void
util_format_a8b8g8r8_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const int8_t a = (int8_t)CLAMP(src[3], -128.0f, 127.0f);
         const int8_t b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         const int8_t g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         const int8_t r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         dst[0] = (uint8_t)a; /* a */
         dst[1] = (uint8_t)b; /* b */
         dst[2] = (uint8_t)g; /* g */
         dst[3] = (uint8_t)r; /* r */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A8B8G8R8_SSCALED pixel as RGBA floats.
 *
 * src points at the pixel; i and j are not used.  The bytes are A, B, G, R
 * in memory on either host byte order, so they are read as individual
 * signed bytes rather than through a 32-bit load that would need an aligned
 * src.  The output order is r, g, b, a.
 */
void
util_format_a8b8g8r8_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const int32_t a = (int8_t)src[0];
   const int32_t b = (int8_t)src[1];
   const int32_t g = (int8_t)src[2];
   const int32_t r = (int8_t)src[3];
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = (float)b; /* b */
   dst[3] = (float)a; /* a */
}

/**
 * Unpack `width` A8B8G8R8_SSCALED pixels from src into RGBA8 unorm at dst_row.
 *
 * Each signed channel is clamped to [0, 1] and multiplied by 0xff, so any
 * value >= 1 becomes 255 and any value <= 0 becomes 0.
 */
void
util_format_a8b8g8r8_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;
      int32_t r, g, b, a;

      /* src has no alignment guarantee: load the packed word via memcpy. */
      memcpy(&value, src, sizeof value);

      /* Shift each byte to the top, then back down signed to sign-extend. */
#if UTIL_ARCH_BIG_ENDIAN
      a = ((int32_t)(value) ) >> 24;
      b = ((int32_t)(value << 8) ) >> 24;
      g = ((int32_t)(value << 16) ) >> 24;
      r = ((int32_t)(value << 24) ) >> 24;
#else
      a = ((int32_t)(value << 24) ) >> 24;
      b = ((int32_t)(value << 16) ) >> 24;
      g = ((int32_t)(value << 8) ) >> 24;
      r = ((int32_t)(value) ) >> 24;
#endif
      dst[0] = (uint8_t)(((uint32_t)CLAMP(r, 0, 1)) * 0xff / 0x1); /* r */
      dst[1] = (uint8_t)(((uint32_t)CLAMP(g, 0, 1)) * 0xff / 0x1); /* g */
      dst[2] = (uint8_t)(((uint32_t)CLAMP(b, 0, 1)) * 0xff / 0x1); /* b */
      dst[3] = (uint8_t)(((uint32_t)CLAMP(a, 0, 1)) * 0xff / 0x1); /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA8 unorm pixels into A8B8G8R8_SSCALED.
 *
 * Each 8-bit channel is divided by 0xff with integer division, so only an
 * input of 255 is stored as 1; every other input is stored as 0.
 * dst_stride and src_stride are in bytes.
 */
void
util_format_a8b8g8r8_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t r = ((uint32_t)src[0]) * 0x1 / 0xff;
         const uint32_t g = ((uint32_t)src[1]) * 0x1 / 0xff;
         const uint32_t b = ((uint32_t)src[2]) * 0x1 / 0xff;
         const uint32_t a = ((uint32_t)src[3]) * 0x1 / 0xff;
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = (a << 24) | (b << 16) | (g << 8) | r;
#else
         const uint32_t value = a | (b << 8) | (g << 16) | (r << 24);
#endif
         /* dst is a byte pointer with no alignment guarantee, so store the
          * packed word with memcpy instead of through a cast pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R32_FIXED pixel: a single signed 32-bit channel.
 * The unpack helpers in this file scale r by 1.0/0x10000, i.e. they treat it
 * as having 16 fractional bits. */
struct util_format_r32_fixed {
   int32_t r;
};

void
util_format_r32_fixed_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_r32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into R32_FIXED (one signed 32-bit channel
 * with 16 fractional bits).
 *
 * Only src[0] (red) is stored.  It is scaled by 0x10000 in double precision
 * and saturated to the int32_t range before the truncating conversion, so
 * out-of-range input never overflows the float -> int32_t cast.  NaN fails
 * the range comparison and is stored as INT32_MIN.
 * dst_stride and src_stride are in bytes.
 */
void
util_format_r32_fixed_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const double scaled = (double)src[0] * (double)0x10000;
         int32_t r;

         /* Converting a double outside the int32_t range is undefined, so
          * saturate first.  Both bounds are exactly representable. */
         if (scaled > (double)INT32_MIN)
            r = scaled > (double)INT32_MAX ? INT32_MAX : (int32_t)scaled;
         else
            r = INT32_MIN;

         /* The pixel is exactly one int32_t; dst may be unaligned. */
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32_FIXED pixel from src into in_dst as RGBA floats.
 *
 * Red is the stored value multiplied by 1/0x10000; green and blue are
 * written as 0 and alpha as 1.  The i and j arguments are not used.
 */
void
util_format_r32_fixed_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32_fixed texel;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)(texel.r * (1.0/0x10000)); /* r */
   rgba[1] = 0; /* g */
   rgba[2] = 0; /* b */
   rgba[3] = 1; /* a */
}

void
util_format_r32_fixed_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_r32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

void
util_format_r32_fixed_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r32_fixed pixel = {0};
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R32G32_FIXED pixel: two signed 32-bit channels,
 * red first.  The member order does not depend on UTIL_ARCH_BIG_ENDIAN. */
struct util_format_r32g32_fixed {
   int32_t r;
   int32_t g;
};

void
util_format_r32g32_fixed_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x10000)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x10000)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into R32G32_FIXED (two signed 32-bit
 * channels with 16 fractional bits each, red first).
 *
 * src[0] and src[1] are each scaled by 0x10000 in double precision and
 * saturated to the int32_t range before the truncating conversion, so
 * out-of-range input never overflows the float -> int32_t cast.  NaN fails
 * the range comparison and is stored as INT32_MIN.  The result does not
 * depend on UTIL_ARCH_BIG_ENDIAN.  dst_stride and src_stride are in bytes.
 */
void
util_format_r32g32_fixed_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* Two tightly packed int32_t channels: r, g. */
         int32_t pixel[2];

         for (unsigned c = 0; c < 2; c += 1) {
            const double scaled = (double)src[c] * (double)0x10000;

            /* Converting a double outside the int32_t range is undefined,
             * so saturate first.  Both bounds are exactly representable. */
            if (scaled > (double)INT32_MIN)
               pixel[c] = scaled > (double)INT32_MAX ? INT32_MAX : (int32_t)scaled;
            else
               pixel[c] = INT32_MIN;
         }

         memcpy(dst, pixel, sizeof pixel);
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32_FIXED pixel from src into in_dst as RGBA floats.
 *
 * Red and green are the stored values multiplied by 1/0x10000; blue is 0
 * and alpha is 1.  The i and j arguments are not used, and the conversion
 * does not depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r32g32_fixed_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32_fixed texel;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)(texel.r * (1.0/0x10000)); /* r */
   rgba[1] = (float)(texel.g * (1.0/0x10000)); /* g */
   rgba[2] = 0; /* b */
   rgba[3] = 1; /* a */
}

void
util_format_r32g32_fixed_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround((CLAMP(pixel.g, 0, 65536) * (1.0/0x10000)) * 0xff); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround((CLAMP(pixel.g, 0, 65536) * (1.0/0x10000)) * 0xff); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_fixed_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_fixed pixel = {0};
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         pixel.g = (int32_t)((float)(src[1] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_fixed pixel = {0};
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         pixel.g = (int32_t)((float)(src[1] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R32G32B32_FIXED pixel: three signed 32-bit
 * channels in r, g, b order.  The member order does not depend on
 * UTIL_ARCH_BIG_ENDIAN. */
struct util_format_r32g32b32_fixed {
   int32_t r;
   int32_t g;
   int32_t b;
};

void
util_format_r32g32b32_fixed_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x10000)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x10000)); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x10000)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x10000)); /* b */
         dst[3] = 1; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into R32G32B32_FIXED (three signed 32-bit
 * channels with 16 fractional bits each, in r, g, b order).
 *
 * src[0..2] are each scaled by 0x10000 in double precision and saturated to
 * the int32_t range before the truncating conversion, so out-of-range input
 * never overflows the float -> int32_t cast.  NaN fails the range comparison
 * and is stored as INT32_MIN.  The result does not depend on
 * UTIL_ARCH_BIG_ENDIAN.  dst_stride and src_stride are in bytes.
 */
void
util_format_r32g32b32_fixed_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* Three tightly packed int32_t channels: r, g, b. */
         int32_t pixel[3];

         for (unsigned c = 0; c < 3; c += 1) {
            const double scaled = (double)src[c] * (double)0x10000;

            /* Converting a double outside the int32_t range is undefined,
             * so saturate first.  Both bounds are exactly representable. */
            if (scaled > (double)INT32_MIN)
               pixel[c] = scaled > (double)INT32_MAX ? INT32_MAX : (int32_t)scaled;
            else
               pixel[c] = INT32_MIN;
         }

         memcpy(dst, pixel, sizeof pixel);
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32_FIXED pixel from src into in_dst as RGBA floats.
 *
 * Red, green and blue are the stored values multiplied by 1/0x10000; alpha
 * is 1.  The i and j arguments are not used, and the conversion does not
 * depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r32g32b32_fixed_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32b32_fixed texel;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)(texel.r * (1.0/0x10000)); /* r */
   rgba[1] = (float)(texel.g * (1.0/0x10000)); /* g */
   rgba[2] = (float)(texel.b * (1.0/0x10000)); /* b */
   rgba[3] = 1; /* a */
}

void
util_format_r32g32b32_fixed_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround((CLAMP(pixel.g, 0, 65536) * (1.0/0x10000)) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround((CLAMP(pixel.b, 0, 65536) * (1.0/0x10000)) * 0xff); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32b32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround((CLAMP(pixel.g, 0, 65536) * (1.0/0x10000)) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround((CLAMP(pixel.b, 0, 65536) * (1.0/0x10000)) * 0xff); /* b */
         dst[3] = 255; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_fixed_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_fixed pixel = {0};
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         pixel.g = (int32_t)((float)(src[1] * (1.0f/0xff)) * (double)0x10000);
         pixel.b = (int32_t)((float)(src[2] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_fixed pixel = {0};
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         pixel.g = (int32_t)((float)(src[1] * (1.0f/0xff)) * (double)0x10000);
         pixel.b = (int32_t)((float)(src[2] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* In-memory layout of one R32G32B32A32_FIXED pixel: four signed 32-bit
 * channels in r, g, b, a order.  The member order does not depend on
 * UTIL_ARCH_BIG_ENDIAN. */
struct util_format_r32g32b32a32_fixed {
   int32_t r;
   int32_t g;
   int32_t b;
   int32_t a;
};

void
util_format_r32g32b32a32_fixed_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x10000)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x10000)); /* b */
         dst[3] = (float)(pixel.a * (1.0/0x10000)); /* a */
#else
         struct util_format_r32g32b32a32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x10000)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x10000)); /* b */
         dst[3] = (float)(pixel.a * (1.0/0x10000)); /* a */
#endif
      src += 16;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into R32G32B32A32_FIXED (four signed 32-bit
 * channels with 16 fractional bits each, in r, g, b, a order).
 *
 * src[0..3] are each scaled by 0x10000 in double precision and saturated to
 * the int32_t range before the truncating conversion, so out-of-range input
 * never overflows the float -> int32_t cast.  NaN fails the range comparison
 * and is stored as INT32_MIN.  The result does not depend on
 * UTIL_ARCH_BIG_ENDIAN.  dst_stride and src_stride are in bytes.
 */
void
util_format_r32g32b32a32_fixed_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* Four tightly packed int32_t channels: r, g, b, a. */
         int32_t pixel[4];

         for (unsigned c = 0; c < 4; c += 1) {
            const double scaled = (double)src[c] * (double)0x10000;

            /* Converting a double outside the int32_t range is undefined,
             * so saturate first.  Both bounds are exactly representable. */
            if (scaled > (double)INT32_MIN)
               pixel[c] = scaled > (double)INT32_MAX ? INT32_MAX : (int32_t)scaled;
            else
               pixel[c] = INT32_MIN;
         }

         memcpy(dst, pixel, sizeof pixel);
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32A32_FIXED pixel from src into in_dst as RGBA
 * floats.
 *
 * Every channel is the stored value multiplied by 1/0x10000.  The i and j
 * arguments are not used, and the conversion does not depend on
 * UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r32g32b32a32_fixed_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   struct util_format_r32g32b32a32_fixed texel;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = (float)(texel.r * (1.0/0x10000)); /* r */
   rgba[1] = (float)(texel.g * (1.0/0x10000)); /* g */
   rgba[2] = (float)(texel.b * (1.0/0x10000)); /* b */
   rgba[3] = (float)(texel.a * (1.0/0x10000)); /* a */
}

void
util_format_r32g32b32a32_fixed_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround((CLAMP(pixel.g, 0, 65536) * (1.0/0x10000)) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround((CLAMP(pixel.b, 0, 65536) * (1.0/0x10000)) * 0xff); /* b */
         dst[3] = (uint8_t)util_iround((CLAMP(pixel.a, 0, 65536) * (1.0/0x10000)) * 0xff); /* a */
#else
         struct util_format_r32g32b32a32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround((CLAMP(pixel.g, 0, 65536) * (1.0/0x10000)) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround((CLAMP(pixel.b, 0, 65536) * (1.0/0x10000)) * 0xff); /* b */
         dst[3] = (uint8_t)util_iround((CLAMP(pixel.a, 0, 65536) * (1.0/0x10000)) * 0xff); /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_fixed_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_fixed pixel = {0};
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         pixel.g = (int32_t)((float)(src[1] * (1.0f/0xff)) * (double)0x10000);
         pixel.b = (int32_t)((float)(src[2] * (1.0f/0xff)) * (double)0x10000);
         pixel.a = (int32_t)((float)(src[3] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_fixed pixel = {0};
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         pixel.g = (int32_t)((float)(src[1] * (1.0f/0xff)) * (double)0x10000);
         pixel.b = (int32_t)((float)(src[2] * (1.0f/0xff)) * (double)0x10000);
         pixel.a = (int32_t)((float)(src[3] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack `width` R10G10B10X2_USCALED pixels from src into RGBA floats at
 * dst_row.
 *
 * Red is bits 0-9, green bits 10-19 and blue bits 20-29 of the 32-bit word;
 * each is converted to float without scaling.  The top two bits are ignored
 * and alpha is written as 1.  The bit extraction is the same for either
 * value of UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r10g10b10x2_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;

      /* src is a byte pointer with no alignment guarantee: load via memcpy
       * rather than dereferencing a cast pointer. */
      memcpy(&value, src, sizeof value);

      dst[0] = (float)((value) & 0x3ff); /* r */
      dst[1] = (float)((value >> 10) & 0x3ff); /* g */
      dst[2] = (float)((value >> 20) & 0x3ff); /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into R10G10B10X2_USCALED.
 *
 * src[0..2] are clamped to [0, 1023], truncated to integers and stored in
 * bits 0-9 (r), 10-19 (g) and 20-29 (b); the top two bits are written as 0
 * and src[3] is ignored.  The packed word is the same for either value of
 * UTIL_ARCH_BIG_ENDIAN.  dst_stride and src_stride are in bytes.
 */
void
util_format_r10g10b10x2_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t r = ((uint32_t)CLAMP(src[0], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t g = ((uint32_t)CLAMP(src[1], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t b = ((uint32_t)CLAMP(src[2], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t value = r | (g << 10) | (b << 20);

         /* dst is a byte pointer with no alignment guarantee, so store the
          * packed word with memcpy instead of through a cast pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R10G10B10X2_USCALED pixel from src into in_dst as RGBA floats.
 *
 * Red is bits 0-9, green bits 10-19 and blue bits 20-29 of the 32-bit word,
 * each converted to float without scaling; alpha is written as 1.  The i and
 * j arguments are unused, and the bit extraction is the same for either
 * value of UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r10g10b10x2_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   uint32_t value;

   /* src is a byte pointer with no alignment guarantee: load via memcpy
    * rather than dereferencing a cast pointer. */
   memcpy(&value, src, sizeof value);

   dst[0] = (float)((value) & 0x3ff); /* r */
   dst[1] = (float)((value >> 10) & 0x3ff); /* g */
   dst[2] = (float)((value >> 20) & 0x3ff); /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack `width` R10G10B10X2_USCALED pixels from src into RGBA8 unorm at
 * dst_row.
 *
 * Each 10-bit channel is limited to at most 1 and multiplied by 0xff, so any
 * non-zero channel becomes 255.  Alpha is written as 255.  The bit
 * extraction is the same for either value of UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r10g10b10x2_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;

      /* src has no alignment guarantee: load the packed word via memcpy. */
      memcpy(&value, src, sizeof value);

      const uint32_t r = (value) & 0x3ff;
      const uint32_t g = (value >> 10) & 0x3ff;
      const uint32_t b = (value >> 20) & 0x3ff;

      dst[0] = (uint8_t)(((uint32_t)MIN2(r, 1)) * 0xff / 0x1); /* r */
      dst[1] = (uint8_t)(((uint32_t)MIN2(g, 1)) * 0xff / 0x1); /* g */
      dst[2] = (uint8_t)(((uint32_t)MIN2(b, 1)) * 0xff / 0x1); /* b */
      dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA8 unorm pixels into R10G10B10X2_USCALED.
 *
 * Each of src[0..2] is divided by 0xff with integer division, so only an
 * input of 255 is stored as 1 and every other input as 0.  Red goes to bits
 * 0-9, green to bits 10-19 and blue to bits 20-29; src[3] is ignored.  The
 * packed word is the same for either value of UTIL_ARCH_BIG_ENDIAN.
 * dst_stride and src_stride are in bytes.
 */
void
util_format_r10g10b10x2_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t r = ((uint32_t)(((uint32_t)src[0]) * 0x1 / 0xff)) & 0x3ff;
         const uint32_t g = ((uint32_t)(((uint32_t)src[1]) * 0x1 / 0xff)) & 0x3ff;
         const uint32_t b = ((uint32_t)(((uint32_t)src[2]) * 0x1 / 0xff)) & 0x3ff;
         const uint32_t value = r | (g << 10) | (b << 20);

         /* dst is a byte pointer with no alignment guarantee, so store the
          * packed word with memcpy instead of through a cast pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack `width` R10G10B10X2_SNORM pixels from src into RGBA floats at
 * dst_row.
 *
 * Red is bits 0-9, green bits 10-19 and blue bits 20-29 of the 32-bit word,
 * each a signed 10-bit value scaled by 1/0x1ff.  The result is limited to
 * -1.0 from below, so the raw value -512 unpacks to -1.0 rather than a value
 * below it.  Alpha is written as 1.  The bit extraction is the same for
 * either value of UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r10g10b10x2_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;

      /* src is a byte pointer with no alignment guarantee: load via memcpy
       * rather than dereferencing a cast pointer. */
      memcpy(&value, src, sizeof value);

      /* Move each 10-bit field to the top of the word, then shift it back
       * down as a signed value so the result is sign-extended. */
      const int32_t r = ((int32_t)(value << 22) ) >> 22;
      const int32_t g = ((int32_t)(value << 12) ) >> 22;
      const int32_t b = ((int32_t)(value << 2) ) >> 22;

      const float rf = (float)(r * (1.0f/0x1ff));
      const float gf = (float)(g * (1.0f/0x1ff));
      const float bf = (float)(b * (1.0f/0x1ff));

      dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
      dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
      dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of RGBA float pixels into R10G10B10X2_SNORM.
 *
 * src[0..2] are clamped to [-1, 1], scaled by 0x1ff, rounded with
 * util_iround() and stored as 10-bit fields in bits 0-9 (r), 10-19 (g) and
 * 20-29 (b).  The top two bits are written as 0 and src[3] is ignored.  The
 * packed word is the same for either value of UTIL_ARCH_BIG_ENDIAN.
 * dst_stride and src_stride are in bytes.
 */
void
util_format_r10g10b10x2_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* The mask keeps the low 10 bits of each rounded channel. */
         const uint32_t r = ((uint32_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t g = ((uint32_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t b = ((uint32_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t value = r | (g << 10) | (b << 20);

         /* dst is a byte pointer with no alignment guarantee, so store the
          * packed word with memcpy instead of through a cast pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R10G10B10X2_SNORM pixel from src into in_dst as RGBA floats.
 *
 * Red is bits 0-9, green bits 10-19 and blue bits 20-29 of the 32-bit word,
 * each a signed 10-bit value scaled by 1/0x1ff.  The result is limited to
 * -1.0 from below, so the raw value -512 yields -1.0 rather than a value
 * below it.  Alpha is written as 1.  The i and j arguments are unused, and
 * the bit extraction is the same for either value of UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r10g10b10x2_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   uint32_t value;

   /* src is a byte pointer with no alignment guarantee: load via memcpy
    * rather than dereferencing a cast pointer. */
   memcpy(&value, src, sizeof value);

   /* Move each 10-bit field to the top of the word, then shift it back down
    * as a signed value so the result is sign-extended. */
   const int32_t r = ((int32_t)(value << 22) ) >> 22;
   const int32_t g = ((int32_t)(value << 12) ) >> 22;
   const int32_t b = ((int32_t)(value << 2) ) >> 22;

   const float rf = (float)(r * (1.0f/0x1ff));
   const float gf = (float)(g * (1.0f/0x1ff));
   const float bf = (float)(b * (1.0f/0x1ff));

   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
   dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack `width` R10G10B10X2_SNORM pixels from src into RGBA8 unorm at
 * dst_row.
 *
 * Each signed 10-bit channel has negative values raised to 0 and is then
 * converted with _mesa_snorm_to_unorm(value, 10, 8).  Alpha is written as
 * 255.  The bit extraction is the same for either value of
 * UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r10g10b10x2_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;

      /* src has no alignment guarantee: load the packed word via memcpy. */
      memcpy(&value, src, sizeof value);

      /* Move each 10-bit field to the top of the word, then shift it back
       * down as a signed value so the result is sign-extended. */
      const int32_t r = ((int32_t)(value << 22) ) >> 22;
      const int32_t g = ((int32_t)(value << 12) ) >> 22;
      const int32_t b = ((int32_t)(value << 2) ) >> 22;

      dst[0] = _mesa_snorm_to_unorm(MAX2(r, 0), 10, 8); /* r */
      dst[1] = _mesa_snorm_to_unorm(MAX2(g, 0), 10, 8); /* g */
      dst[2] = _mesa_snorm_to_unorm(MAX2(b, 0), 10, 8); /* b */
      dst[3] = 255; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA8 pixels into R10G10B10X2_SNORM.
 *
 * Red, green and blue are converted with _mesa_unorm_to_snorm(c, 8, 10),
 * masked to 10 bits and stored at bits 0-9, 10-19 and 20-29 of a native
 * 32-bit word.  Source alpha is ignored and bits 30-31 are left zero.  The
 * word layout is the same on both byte orders.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows; width and height give the size of the rectangle in pixels.
 */
void
util_format_r10g10b10x2_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++) {
         const uint32_t red   = (uint32_t)(_mesa_unorm_to_snorm(in[0], 8, 10) & 0x3ff);
         const uint32_t green = (uint32_t)(_mesa_unorm_to_snorm(in[1], 8, 10) & 0x3ff);
         const uint32_t blue  = (uint32_t)(_mesa_unorm_to_snorm(in[2], 8, 10) & 0x3ff);

         *out++ = red | (green << 10) | (blue << 20);
         in += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of A4R4_UNORM pixels (one byte each) to RGBA floats.
 *
 * Alpha is the low nibble and red the high nibble of each byte; the layout
 * is the same on both byte orders.  Both are scaled by 1/15; green and blue
 * are written as 0.
 *
 * dst_row receives 4 floats per pixel; width is the pixel count.
 */
void
util_format_a4r4_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;

   for (unsigned left = width; left != 0; left--) {
      const uint8_t pixel = *src++;
      const uint8_t alpha = pixel & 0xf;
      const uint8_t red = pixel >> 4;

      out[0] = (float)(red * (1.0f/0xf));
      out[1] = 0;
      out[2] = 0;
      out[3] = (float)(alpha * (1.0f/0xf));
      out += 4;
   }
}

/**
 * Pack RGBA float pixels into A4R4_UNORM (one byte per pixel).
 *
 * Red and alpha are run through CLAMP(c, 0.0f, 1.0f), scaled by 15 and
 * rounded with util_iround().  Alpha is stored in the low nibble and red in
 * the high nibble; green and blue are ignored.  The byte layout is the same
 * on both byte orders.
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_a4r4_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++) {
         const uint8_t alpha = (uint8_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0xf);
         const uint8_t red = (uint8_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0xf);

         *out++ = (uint8_t)((alpha & 0xf) | ((uint32_t)red << 4));
         in += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A4R4_UNORM pixel as four floats.
 *
 * src points at the pixel's byte: alpha is the low nibble, red the high
 * nibble, on either byte order.  Both are scaled by 1/15; green and blue
 * are written as 0.  The i and j arguments are not used.
 */
void
util_format_a4r4_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint8_t pixel = src[0];
   const uint8_t red = pixel >> 4;
   const uint8_t alpha = pixel & 0xf;

   rgba[0] = (float)(red * (1.0f/0xf));
   rgba[1] = 0;
   rgba[2] = 0;
   rgba[3] = (float)(alpha * (1.0f/0xf));
}

/**
 * Unpack a row of A4R4_UNORM pixels (one byte each) into RGBA8.
 *
 * Alpha is the low nibble and red the high nibble of each source byte, on
 * either byte order.  Both are widened with _mesa_unorm_to_unorm(c, 4, 8);
 * green and blue are written as 0.
 */
void
util_format_a4r4_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;

   for (unsigned left = width; left != 0; left--) {
      const uint8_t pixel = *src++;
      const uint8_t alpha = pixel & 0xf;
      const uint8_t red = pixel >> 4;

      out[0] = _mesa_unorm_to_unorm(red, 4, 8);
      out[1] = 0;
      out[2] = 0;
      out[3] = _mesa_unorm_to_unorm(alpha, 4, 8);
      out += 4;
   }
}

/**
 * Pack RGBA8 pixels into A4R4_UNORM (one byte per pixel).
 *
 * Red and alpha are narrowed with _mesa_unorm_to_unorm(c, 8, 4).  Alpha is
 * stored in the low nibble and red in the high nibble; green and blue are
 * ignored.  The byte layout is the same on both byte orders.
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_a4r4_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++) {
         const uint32_t alpha = (_mesa_unorm_to_unorm(in[3], 8, 4)) & 0xf;
         const uint32_t red = (uint32_t)(_mesa_unorm_to_unorm(in[0], 8, 4));

         *out++ = (uint8_t)(alpha | (red << 4));
         in += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R4A4_UNORM pixels (one byte each) to RGBA floats.
 *
 * Red is the low nibble and alpha the high nibble of each byte; the layout
 * is the same on both byte orders.  Both are scaled by 1/15; green and blue
 * are written as 0.
 *
 * dst_row receives 4 floats per pixel; width is the pixel count.
 */
void
util_format_r4a4_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;

   for (unsigned left = width; left != 0; left--) {
      const uint8_t pixel = *src++;
      const uint8_t red = pixel & 0xf;
      const uint8_t alpha = pixel >> 4;

      out[0] = (float)(red * (1.0f/0xf));
      out[1] = 0;
      out[2] = 0;
      out[3] = (float)(alpha * (1.0f/0xf));
      out += 4;
   }
}

/**
 * Pack RGBA float pixels into R4A4_UNORM (one byte per pixel).
 *
 * Red and alpha are run through CLAMP(c, 0.0f, 1.0f), scaled by 15 and
 * rounded with util_iround().  Red is stored in the low nibble and alpha in
 * the high nibble; green and blue are ignored.  The byte layout is the same
 * on both byte orders.
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_r4a4_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++) {
         const uint8_t red = (uint8_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0xf);
         const uint8_t alpha = (uint8_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0xf);

         *out++ = (uint8_t)((red & 0xf) | ((uint32_t)alpha << 4));
         in += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R4A4_UNORM pixel as four floats.
 *
 * src points at the pixel's byte: red is the low nibble, alpha the high
 * nibble, on either byte order.  Both are scaled by 1/15; green and blue
 * are written as 0.  The i and j arguments are not used.
 */
void
util_format_r4a4_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint8_t pixel = src[0];
   const uint8_t red = pixel & 0xf;
   const uint8_t alpha = pixel >> 4;

   rgba[0] = (float)(red * (1.0f/0xf));
   rgba[1] = 0;
   rgba[2] = 0;
   rgba[3] = (float)(alpha * (1.0f/0xf));
}

/**
 * Unpack a row of R4A4_UNORM pixels (one byte each) into RGBA8.
 *
 * Red is the low nibble and alpha the high nibble of each source byte, on
 * either byte order.  Both are widened with _mesa_unorm_to_unorm(c, 4, 8);
 * green and blue are written as 0.
 */
void
util_format_r4a4_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;

   for (unsigned left = width; left != 0; left--) {
      const uint8_t pixel = *src++;
      const uint8_t red = pixel & 0xf;
      const uint8_t alpha = pixel >> 4;

      out[0] = _mesa_unorm_to_unorm(red, 4, 8);
      out[1] = 0;
      out[2] = 0;
      out[3] = _mesa_unorm_to_unorm(alpha, 4, 8);
      out += 4;
   }
}

/**
 * Pack RGBA8 pixels into R4A4_UNORM (one byte per pixel).
 *
 * Red and alpha are narrowed with _mesa_unorm_to_unorm(c, 8, 4).  Red is
 * stored in the low nibble and alpha in the high nibble; green and blue are
 * ignored.  The byte layout is the same on both byte orders.
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_r4a4_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++) {
         const uint32_t red = (_mesa_unorm_to_unorm(in[0], 8, 4)) & 0xf;
         const uint32_t alpha = (uint32_t)(_mesa_unorm_to_unorm(in[3], 8, 4));

         *out++ = (uint8_t)(red | (alpha << 4));
         in += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R8A8_UNORM pixels (two bytes each) to RGBA floats.
 *
 * Each pixel is read as a native 16-bit word.  On little-endian builds red
 * is the low byte and alpha the high byte; on big-endian builds the two are
 * swapped.  Both go through ubyte_to_float(); green and blue are written
 * as 0.
 */
void
util_format_r8a8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit position of each channel inside the native 16-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { RED_SHIFT = 8, ALPHA_SHIFT = 0 };
#else
   enum { RED_SHIFT = 0, ALPHA_SHIFT = 8 };
#endif
   float *out = dst_row;

   for (unsigned left = width; left != 0; left--) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t red = (pixel >> RED_SHIFT) & 0xff;
      const uint16_t alpha = (pixel >> ALPHA_SHIFT) & 0xff;

      out[0] = ubyte_to_float(red);
      out[1] = 0;
      out[2] = 0;
      out[3] = ubyte_to_float(alpha);

      src += 2;
      out += 4;
   }
}

/**
 * Pack RGBA float pixels into R8A8_UNORM (two bytes per pixel).
 *
 * Red and alpha are converted with float_to_ubyte() and combined into a
 * native 16-bit word.  On little-endian builds red is the low byte and
 * alpha the high byte; on big-endian builds the two are swapped.  Green and
 * blue are ignored.
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_r8a8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native 16-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { RED_SHIFT = 8, ALPHA_SHIFT = 0 };
#else
   enum { RED_SHIFT = 0, ALPHA_SHIFT = 8 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++) {
         const uint32_t red = (float_to_ubyte(in[0])) & 0xff;
         const uint32_t alpha = (float_to_ubyte(in[3])) & 0xff;

         *(uint16_t *)out = (uint16_t)((red << RED_SHIFT) | (alpha << ALPHA_SHIFT));

         in += 4;
         out += 2;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8A8_UNORM pixel as four floats.
 *
 * The pixel is read from src as a native 16-bit word.  On little-endian
 * builds red is the low byte and alpha the high byte; on big-endian builds
 * the two are swapped.  Both go through ubyte_to_float(); green and blue
 * are written as 0.  The i and j arguments are not used.
 */
void
util_format_r8a8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit position of each channel inside the native 16-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { RED_SHIFT = 8, ALPHA_SHIFT = 0 };
#else
   enum { RED_SHIFT = 0, ALPHA_SHIFT = 8 };
#endif
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t red = (pixel >> RED_SHIFT) & 0xff;
   const uint16_t alpha = (pixel >> ALPHA_SHIFT) & 0xff;

   rgba[0] = ubyte_to_float(red);
   rgba[1] = 0;
   rgba[2] = 0;
   rgba[3] = ubyte_to_float(alpha);
}

/**
 * Unpack a row of R8A8_UNORM pixels (two bytes each) into RGBA8.
 *
 * Each pixel is read as a native 16-bit word.  On little-endian builds red
 * is the low byte and alpha the high byte; on big-endian builds the two are
 * swapped.  The bytes are copied unchanged; green and blue are written
 * as 0.
 */
void
util_format_r8a8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit position of each channel inside the native 16-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { RED_SHIFT = 8, ALPHA_SHIFT = 0 };
#else
   enum { RED_SHIFT = 0, ALPHA_SHIFT = 8 };
#endif
   uint8_t *out = dst_row;

   for (unsigned left = width; left != 0; left--) {
      const uint16_t pixel = *(const uint16_t *)src;

      out[0] = (uint8_t)((pixel >> RED_SHIFT) & 0xff);
      out[1] = 0;
      out[2] = 0;
      out[3] = (uint8_t)((pixel >> ALPHA_SHIFT) & 0xff);

      src += 2;
      out += 4;
   }
}

/**
 * Pack RGBA8 pixels into R8A8_UNORM (two bytes per pixel).
 *
 * Red and alpha are copied unchanged into a native 16-bit word.  On
 * little-endian builds red is the low byte and alpha the high byte; on
 * big-endian builds the two are swapped.  Green and blue are ignored.
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_r8a8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native 16-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { RED_SHIFT = 8, ALPHA_SHIFT = 0 };
#else
   enum { RED_SHIFT = 0, ALPHA_SHIFT = 8 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++) {
         const uint32_t red = in[0];
         const uint32_t alpha = in[3];

         *(uint16_t *)out = (uint16_t)((red << RED_SHIFT) | (alpha << ALPHA_SHIFT));

         in += 4;
         out += 2;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of A8R8_UNORM pixels (two bytes each) to RGBA floats.
 *
 * Each pixel is read as a native 16-bit word.  On little-endian builds
 * alpha is the low byte and red the high byte; on big-endian builds the two
 * are swapped.  Both go through ubyte_to_float(); green and blue are
 * written as 0.
 */
void
util_format_a8r8_unorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit position of each channel inside the native 16-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { RED_SHIFT = 0, ALPHA_SHIFT = 8 };
#else
   enum { RED_SHIFT = 8, ALPHA_SHIFT = 0 };
#endif
   float *out = dst_row;

   for (unsigned left = width; left != 0; left--) {
      const uint16_t pixel = *(const uint16_t *)src;
      const uint16_t alpha = (pixel >> ALPHA_SHIFT) & 0xff;
      const uint16_t red = (pixel >> RED_SHIFT) & 0xff;

      out[0] = ubyte_to_float(red);
      out[1] = 0;
      out[2] = 0;
      out[3] = ubyte_to_float(alpha);

      src += 2;
      out += 4;
   }
}

/**
 * Pack RGBA float pixels into A8R8_UNORM (two bytes per pixel).
 *
 * Red and alpha are converted with float_to_ubyte() and combined into a
 * native 16-bit word.  On little-endian builds alpha is the low byte and
 * red the high byte; on big-endian builds the two are swapped.  Green and
 * blue are ignored.
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_a8r8_unorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native 16-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { RED_SHIFT = 0, ALPHA_SHIFT = 8 };
#else
   enum { RED_SHIFT = 8, ALPHA_SHIFT = 0 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++) {
         const uint32_t alpha = (float_to_ubyte(in[3])) & 0xff;
         const uint32_t red = (float_to_ubyte(in[0])) & 0xff;

         *(uint16_t *)out = (uint16_t)((alpha << ALPHA_SHIFT) | (red << RED_SHIFT));

         in += 4;
         out += 2;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A8R8_UNORM pixel as four floats.
 *
 * The pixel is read from src as a native 16-bit word.  On little-endian
 * builds alpha is the low byte and red the high byte; on big-endian builds
 * the two are swapped.  Both go through ubyte_to_float(); green and blue
 * are written as 0.  The i and j arguments are not used.
 */
void
util_format_a8r8_unorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit position of each channel inside the native 16-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { RED_SHIFT = 0, ALPHA_SHIFT = 8 };
#else
   enum { RED_SHIFT = 8, ALPHA_SHIFT = 0 };
#endif
   float *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t alpha = (pixel >> ALPHA_SHIFT) & 0xff;
   const uint16_t red = (pixel >> RED_SHIFT) & 0xff;

   rgba[0] = ubyte_to_float(red);
   rgba[1] = 0;
   rgba[2] = 0;
   rgba[3] = ubyte_to_float(alpha);
}

/**
 * Unpack a row of A8R8_UNORM pixels (two bytes each) into RGBA8.
 *
 * Each pixel is read as a native 16-bit word.  On little-endian builds
 * alpha is the low byte and red the high byte; on big-endian builds the two
 * are swapped.  The bytes are copied unchanged; green and blue are written
 * as 0.
 */
void
util_format_a8r8_unorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Bit position of each channel inside the native 16-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { RED_SHIFT = 0, ALPHA_SHIFT = 8 };
#else
   enum { RED_SHIFT = 8, ALPHA_SHIFT = 0 };
#endif
   uint8_t *out = dst_row;

   for (unsigned left = width; left != 0; left--) {
      const uint16_t pixel = *(const uint16_t *)src;

      out[0] = (uint8_t)((pixel >> RED_SHIFT) & 0xff);
      out[1] = 0;
      out[2] = 0;
      out[3] = (uint8_t)((pixel >> ALPHA_SHIFT) & 0xff);

      src += 2;
      out += 4;
   }
}

/**
 * Pack RGBA8 pixels into A8R8_UNORM (two bytes per pixel).
 *
 * Red and alpha are copied unchanged into a native 16-bit word.  On
 * little-endian builds alpha is the low byte and red the high byte; on
 * big-endian builds the two are swapped.  Green and blue are ignored.
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_a8r8_unorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native 16-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { RED_SHIFT = 0, ALPHA_SHIFT = 8 };
#else
   enum { RED_SHIFT = 8, ALPHA_SHIFT = 0 };
#endif

   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++) {
         const uint32_t alpha = in[3];
         const uint32_t red = in[0];

         *(uint16_t *)out = (uint16_t)((alpha << ALPHA_SHIFT) | (red << RED_SHIFT));

         in += 4;
         out += 2;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R10G10B10A2_USCALED pixels to RGBA floats.
 *
 * Each pixel is read as a native 32-bit word with red in bits 0-9, green in
 * bits 10-19, blue in bits 20-29 and alpha in bits 30-31 (the same on both
 * byte orders).  The unsigned field values are converted to float without
 * any scaling.
 */
void
util_format_r10g10b10a2_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;

   for (unsigned left = width; left != 0; left--) {
      const uint32_t pixel = *(const uint32_t *)src;
      const uint32_t red   = pixel & 0x3ff;
      const uint32_t green = (pixel >> 10) & 0x3ff;
      const uint32_t blue  = (pixel >> 20) & 0x3ff;
      const uint32_t alpha = pixel >> 30;

      out[0] = (float)red;
      out[1] = (float)green;
      out[2] = (float)blue;
      out[3] = (float)alpha;

      src += 4;
      out += 4;
   }
}

/**
 * Pack RGBA float pixels into R10G10B10A2_USCALED.
 *
 * Red, green and blue are run through CLAMP(c, 0.0f, 1023.0f) and alpha
 * through CLAMP(a, 0.0f, 3.0f), then converted to unsigned integers by a
 * plain cast.  The fields are stored in a native 32-bit word: red in bits
 * 0-9, green in bits 10-19, blue in bits 20-29 and alpha in bits 30-31 (the
 * same on both byte orders).
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_r10g10b10a2_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++) {
         const uint32_t red   = ((uint32_t)CLAMP(in[0], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t green = ((uint32_t)CLAMP(in[1], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t blue  = ((uint32_t)CLAMP(in[2], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t alpha = (uint32_t)CLAMP(in[3], 0.0f, 3.0f);

         *out++ = red | (green << 10) | (blue << 20) | (alpha << 30);
         in += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R10G10B10A2_USCALED pixel as four floats.
 *
 * The pixel is read from src as a native 32-bit word with red in bits 0-9,
 * green in bits 10-19, blue in bits 20-29 and alpha in bits 30-31 (the same
 * on both byte orders).  The unsigned field values are converted to float
 * without scaling.  The i and j arguments are not used.
 */
void
util_format_r10g10b10a2_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint32_t pixel = *(const uint32_t *)src;

   rgba[0] = (float)(uint32_t)(pixel & 0x3ff);
   rgba[1] = (float)(uint32_t)((pixel >> 10) & 0x3ff);
   rgba[2] = (float)(uint32_t)((pixel >> 20) & 0x3ff);
   rgba[3] = (float)(uint32_t)(pixel >> 30);
}

/**
 * Unpack a row of R10G10B10A2_USCALED pixels into RGBA8.
 *
 * Each pixel is read as a native 32-bit word with red in bits 0-9, green in
 * bits 10-19, blue in bits 20-29 and alpha in bits 30-31 (the same on both
 * byte orders).  Every field is limited with MIN2(c, 1) and multiplied by
 * 255 before being stored.
 */
void
util_format_r10g10b10a2_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;

   for (unsigned left = width; left != 0; left--) {
      const uint32_t pixel = *(const uint32_t *)src;
      const uint32_t red   = pixel & 0x3ff;
      const uint32_t green = (pixel >> 10) & 0x3ff;
      const uint32_t blue  = (pixel >> 20) & 0x3ff;
      const uint32_t alpha = pixel >> 30;

      out[0] = (uint8_t)(((uint32_t)MIN2(red, 1)) * 0xff);
      out[1] = (uint8_t)(((uint32_t)MIN2(green, 1)) * 0xff);
      out[2] = (uint8_t)(((uint32_t)MIN2(blue, 1)) * 0xff);
      out[3] = (uint8_t)(((uint32_t)MIN2(alpha, 1)) * 0xff);

      src += 4;
      out += 4;
   }
}

/**
 * Pack RGBA8 pixels into R10G10B10A2_USCALED.
 *
 * Each 8-bit channel is divided by 255 with integer division, so a channel
 * packs to 1 only when it is exactly 255 and to 0 otherwise.  The results
 * are stored in a native 32-bit word: red in bits 0-9, green in bits 10-19,
 * blue in bits 20-29 and alpha in bits 30-31 (the same on both byte
 * orders).
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_r10g10b10a2_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++) {
         const uint32_t red   = ((uint32_t)in[0] / 0xff) & 0x3ff;
         const uint32_t green = ((uint32_t)in[1] / 0xff) & 0x3ff;
         const uint32_t blue  = ((uint32_t)in[2] / 0xff) & 0x3ff;
         const uint32_t alpha = (uint32_t)in[3] / 0xff;

         *out++ = red | (green << 10) | (blue << 20) | (alpha << 30);
         in += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R10G10B10A2_SSCALED pixels to RGBA floats.
 *
 * Each pixel is read as a native 32-bit word with red in bits 0-9, green in
 * bits 10-19, blue in bits 20-29 and alpha in bits 30-31 (the same on both
 * byte orders).  Every field is sign-extended and converted to float
 * without any scaling.
 */
void
util_format_r10g10b10a2_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;

   for (unsigned left = width; left != 0; left--) {
      const uint32_t pixel = *(const uint32_t *)src;

      /* Move each field to the top of the word, then shift it back down as
       * a signed value so that it is sign-extended. */
      const int32_t red   = ((int32_t)(pixel << 22)) >> 22;
      const int32_t green = ((int32_t)(pixel << 12)) >> 22;
      const int32_t blue  = ((int32_t)(pixel << 2)) >> 22;
      const int32_t alpha = ((int32_t)pixel) >> 30;

      out[0] = (float)red;
      out[1] = (float)green;
      out[2] = (float)blue;
      out[3] = (float)alpha;

      src += 4;
      out += 4;
   }
}

/**
 * Pack RGBA float pixels into R10G10B10A2_SSCALED.
 *
 * Red, green and blue are run through CLAMP(c, -512.0f, 511.0f) and alpha
 * through CLAMP(a, -2.0f, 1.0f), converted to signed integers and stored as
 * two's-complement fields in a native 32-bit word: red in bits 0-9, green
 * in bits 10-19, blue in bits 20-29 and alpha in bits 30-31 (the same on
 * both byte orders).
 *
 * The float is converted to int32_t first and only then reinterpreted as
 * uint32_t.  Casting a negative float straight to an unsigned type is
 * undefined behaviour in C, so the direct cast could pack negative inputs
 * incorrectly on some targets.
 *
 * NOTE: this file is generated; the same change is needed in the generator
 * for it to persist.
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_r10g10b10a2_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* float -> int32_t is well defined for the clamped ranges;
          * int32_t -> uint32_t then wraps modulo 2^32, which yields the
          * two's-complement bit pattern that the masks below select. */
         const uint32_t r = (uint32_t)(int32_t)CLAMP(src[0], -512.0f, 511.0f);
         const uint32_t g = (uint32_t)(int32_t)CLAMP(src[1], -512.0f, 511.0f);
         const uint32_t b = (uint32_t)(int32_t)CLAMP(src[2], -512.0f, 511.0f);
         const uint32_t a = (uint32_t)(int32_t)CLAMP(src[3], -2.0f, 1.0f);
         uint32_t value = 0;
         value |= r & 0x3ff;
         value |= (g & 0x3ff) << 10;
         value |= (b & 0x3ff) << 20;
         value |= a << 30;
         *(uint32_t *)dst = value;
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R10G10B10A2_SSCALED pixel as four floats.
 *
 * The pixel is read from src as a native 32-bit word with red in bits 0-9,
 * green in bits 10-19, blue in bits 20-29 and alpha in bits 30-31 (the same
 * on both byte orders).  Every field is sign-extended and converted to
 * float without scaling.  The i and j arguments are not used.
 */
void
util_format_r10g10b10a2_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *rgba = in_dst;
   const uint32_t pixel = *(const uint32_t *)src;

   /* Shift each field to the top of the word and back down as a signed
    * value to sign-extend it. */
   const int32_t red   = ((int32_t)(pixel << 22)) >> 22;
   const int32_t green = ((int32_t)(pixel << 12)) >> 22;
   const int32_t blue  = ((int32_t)(pixel << 2)) >> 22;
   const int32_t alpha = ((int32_t)pixel) >> 30;

   rgba[0] = (float)red;
   rgba[1] = (float)green;
   rgba[2] = (float)blue;
   rgba[3] = (float)alpha;
}

/**
 * Unpack a row of R10G10B10A2_SSCALED pixels into RGBA8.
 *
 * Each pixel is read as a native 32-bit word with red in bits 0-9, green in
 * bits 10-19, blue in bits 20-29 and alpha in bits 30-31 (the same on both
 * byte orders).  The sign-extended colour fields go through CLAMP(c, 0, 1)
 * and alpha through MAX2(a, 0); each result is multiplied by 255 before
 * being stored.
 */
void
util_format_r10g10b10a2_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;

   for (unsigned left = width; left != 0; left--) {
      const uint32_t pixel = *(const uint32_t *)src;

      /* Shift each field to the top of the word and back down as a signed
       * value to sign-extend it. */
      const int32_t red   = ((int32_t)(pixel << 22)) >> 22;
      const int32_t green = ((int32_t)(pixel << 12)) >> 22;
      const int32_t blue  = ((int32_t)(pixel << 2)) >> 22;
      const int32_t alpha = ((int32_t)pixel) >> 30;

      out[0] = (uint8_t)(((uint32_t)CLAMP(red, 0, 1)) * 0xff);
      out[1] = (uint8_t)(((uint32_t)CLAMP(green, 0, 1)) * 0xff);
      out[2] = (uint8_t)(((uint32_t)CLAMP(blue, 0, 1)) * 0xff);
      out[3] = (uint8_t)(((uint32_t)MAX2(alpha, 0)) * 0xff);

      src += 4;
      out += 4;
   }
}

/**
 * Pack RGBA8 pixels into R10G10B10A2_SSCALED.
 *
 * Each 8-bit channel is divided by 255 with integer division, so a channel
 * packs to 1 only when it is exactly 255 and to 0 otherwise.  The results
 * are stored in a native 32-bit word: red in bits 0-9, green in bits 10-19,
 * blue in bits 20-29 and alpha in bits 30-31 (the same on both byte
 * orders).
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_r10g10b10a2_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++) {
         const uint32_t red   = ((uint32_t)in[0] / 0xff) & 0x3ff;
         const uint32_t green = ((uint32_t)in[1] / 0xff) & 0x3ff;
         const uint32_t blue  = ((uint32_t)in[2] / 0xff) & 0x3ff;
         const uint32_t alpha = (uint32_t)in[3] / 0xff;

         *out++ = red | (green << 10) | (blue << 20) | (alpha << 30);
         in += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R10G10B10A2_SNORM pixels to RGBA floats.
 *
 * Each pixel is read as a native 32-bit word with red in bits 0-9, green in
 * bits 10-19, blue in bits 20-29 and alpha in bits 30-31 (the same on both
 * byte orders).  The sign-extended colour fields are scaled by 1/511 and
 * alpha by 1/1.
 *
 * The most negative code of each field (-512 for colour, -2 for alpha)
 * would scale to a value below -1.0, so every result is clamped to -1.0 to
 * keep the output inside the normalized [-1, 1] range.
 *
 * NOTE: this file is generated; the same change is needed in the generator
 * for it to persist.
 *
 * dst_row receives 4 floats per pixel; width is the pixel count.
 */
void
util_format_r10g10b10a2_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const uint32_t value = *(const uint32_t *)src;

      /* Shift each field to the top of the word and back down as a signed
       * value to sign-extend it. */
      const int32_t r = ((int32_t)(value << 22) ) >> 22;
      const int32_t g = ((int32_t)(value << 12) ) >> 22;
      const int32_t b = ((int32_t)(value << 2) ) >> 22;
      const int32_t a = ((int32_t)(value) ) >> 30;

      const float rf = (float)(r * (1.0f/0x1ff));
      const float gf = (float)(g * (1.0f/0x1ff));
      const float bf = (float)(b * (1.0f/0x1ff));
      const float af = (float)(a * (1.0f/0x1));

      dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
      dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
      dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
      dst[3] = af < -1.0f ? -1.0f : af; /* a */

      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA float pixels into R10G10B10A2_SNORM.
 *
 * Every channel is run through CLAMP(c, -1.0f, 1.0f); colour is scaled by
 * 511 and alpha by 1, and the result is rounded with util_iround().  The
 * fields are stored in a native 32-bit word: red in bits 0-9, green in bits
 * 10-19, blue in bits 20-29 and alpha in bits 30-31 (the same on both byte
 * orders).
 *
 * dst_stride and src_stride are row pitches in bytes.
 */
void
util_format_r10g10b10a2_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint32_t *out = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; col++) {
         /* The int result of util_iround() is converted to uint32_t so the
          * masks and shifts below operate on its two's-complement bits. */
         const uint32_t red   = ((uint32_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t green = ((uint32_t)util_iround(CLAMP(in[1], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t blue  = ((uint32_t)util_iround(CLAMP(in[2], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t alpha = (uint32_t)util_iround(CLAMP(in[3], -1.0f, 1.0f) * 0x1);

         *out++ = red | (green << 10) | (blue << 20) | (alpha << 30);
         in += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R10G10B10A2 SNORM pixel as four floats (r, g, b, a).
 *
 * The 32-bit word at src holds red in bits 0-9, green in bits 10-19, blue in
 * bits 20-29 and alpha in bits 30-31, each as a signed field.  Colour
 * channels are scaled by 1/0x1ff and alpha by 1/0x1.
 *
 * The most negative code of each field (-512 for colour, -2 for alpha) would
 * scale to a value below -1.0; signed-normalized decoding defines the result
 * as max(c / (2^(b-1) - 1), -1.0), so every channel is clamped to -1.0.
 *
 * i and j are unused.
 */
void
util_format_r10g10b10a2_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const uint32_t value = *(const uint32_t *)src;

   /* Move each field to the top of the word and shift it back down so its
    * sign bit is propagated.  The extraction is identical for either byte
    * order. */
   const int32_t r = ((int32_t)(value << 22)) >> 22;
   const int32_t g = ((int32_t)(value << 12)) >> 22;
   const int32_t b = ((int32_t)(value << 2)) >> 22;
   const int32_t a = ((int32_t)(value)) >> 30;

   const float rf = (float)(r * (1.0f/0x1ff));
   const float gf = (float)(g * (1.0f/0x1ff));
   const float bf = (float)(b * (1.0f/0x1ff));
   const float af = (float)(a * (1.0f/0x1));

   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
   dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
   dst[3] = af < -1.0f ? -1.0f : af; /* a */
}

/**
 * Unpack a row of R10G10B10A2 SNORM pixels into RGBA bytes.
 *
 * Each 32-bit source word holds red in bits 0-9, green in bits 10-19, blue
 * in bits 20-29 and alpha in bits 30-31, all as signed fields.  Negative
 * channel values are raised to zero with MAX2() before being passed to
 * _mesa_snorm_to_unorm() (called with 10, 8 for colour and 2, 8 for alpha).
 * Four bytes (r, g, b, a) are written per pixel.
 */
void
util_format_r10g10b10a2_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;

   for (unsigned remaining = width; remaining != 0; remaining--) {
      const uint32_t texel = *(const uint32_t *)src;

      /* Shift each field to the top of the word and back down so that its
       * sign bit is propagated; the same for either byte order. */
      const int32_t red   = ((int32_t)(texel << 22)) >> 22;
      const int32_t green = ((int32_t)(texel << 12)) >> 22;
      const int32_t blue  = ((int32_t)(texel << 2)) >> 22;
      const int32_t alpha = ((int32_t)(texel)) >> 30;

      out[0] = _mesa_snorm_to_unorm(MAX2(red, 0), 10, 8);
      out[1] = _mesa_snorm_to_unorm(MAX2(green, 0), 10, 8);
      out[2] = _mesa_snorm_to_unorm(MAX2(blue, 0), 10, 8);
      out[3] = _mesa_snorm_to_unorm(MAX2(alpha, 0), 2, 8);

      src += 4;
      out += 4;
   }
}

/**
 * Pack RGBA byte rows into 32-bit R10G10B10A2 SNORM pixels.
 *
 * Every source pixel is four bytes (r, g, b, a).  Each one is converted by
 * _mesa_unorm_to_snorm() (called with 8, 10 for colour and 8, 2 for alpha).
 * Red is stored in bits 0-9, green in bits 10-19, blue in bits 20-29 and
 * alpha in bits 30-31.
 *
 * dst_stride and src_stride are the distances between rows in bytes.
 */
void
util_format_r10g10b10a2_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         /* The assembled word is the same for either byte order. */
         const uint32_t red   = (uint32_t)((_mesa_unorm_to_snorm(in[0], 8, 10)) & 0x3ff);
         const uint32_t green = (uint32_t)((_mesa_unorm_to_snorm(in[1], 8, 10)) & 0x3ff);
         const uint32_t blue  = (uint32_t)((_mesa_unorm_to_snorm(in[2], 8, 10)) & 0x3ff);
         /* Alpha is left unmasked; the shift by 30 keeps only two bits. */
         const uint32_t alpha = (uint32_t)(_mesa_unorm_to_snorm(in[3], 8, 2));

         *(uint32_t *)out = red | (green << 10) | (blue << 20) | (alpha << 30);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of B10G10R10A2 USCALED pixels into RGBA floats.
 *
 * Each 32-bit source word holds blue in bits 0-9, green in bits 10-19, red
 * in bits 20-29 and alpha in bits 30-31 as unsigned fields.  The integer
 * values are converted to float unchanged and written in r, g, b, a order.
 */
void
util_format_b10g10r10a2_uscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;

   for (unsigned remaining = width; remaining != 0; remaining--) {
      /* The field extraction is identical for either byte order. */
      const uint32_t texel = *(const uint32_t *)src;
      const uint32_t blue  = texel & 0x3ff;
      const uint32_t green = (texel >> 10) & 0x3ff;
      const uint32_t red   = (texel >> 20) & 0x3ff;
      const uint32_t alpha = texel >> 30;

      out[0] = (float)red;
      out[1] = (float)green;
      out[2] = (float)blue;
      out[3] = (float)alpha;

      src += 4;
      out += 4;
   }
}

/**
 * Pack RGBA float rows into 32-bit B10G10R10A2 USCALED pixels.
 *
 * Every source pixel is four consecutive floats (r, g, b, a).  Colour
 * channels are clamped to [0, 1023] and alpha to [0, 3], then converted to
 * unsigned integers by a plain cast.  Blue is stored in bits 0-9, green in
 * bits 10-19, red in bits 20-29 and alpha in bits 30-31.
 *
 * dst_stride and src_stride are the distances between rows in bytes.
 */
void
util_format_b10g10r10a2_uscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         /* The assembled word is the same for either byte order. */
         const uint32_t blue  = ((uint32_t)CLAMP(in[2], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t green = ((uint32_t)CLAMP(in[1], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t red   = ((uint32_t)CLAMP(in[0], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t alpha = (uint32_t)CLAMP(in[3], 0.0f, 3.0f);

         *(uint32_t *)out = blue | (green << 10) | (red << 20) | (alpha << 30);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single B10G10R10A2 USCALED pixel as four floats (r, g, b, a).
 *
 * The 32-bit word at src holds blue in bits 0-9, green in bits 10-19, red in
 * bits 20-29 and alpha in bits 30-31 as unsigned fields; each is converted
 * to float unchanged.  i and j are unused.
 */
void
util_format_b10g10r10a2_uscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;

   /* The field extraction is identical for either byte order. */
   const uint32_t texel = *(const uint32_t *)src;
   const uint32_t blue  = texel & 0x3ff;
   const uint32_t green = (texel >> 10) & 0x3ff;
   const uint32_t red   = (texel >> 20) & 0x3ff;
   const uint32_t alpha = texel >> 30;

   out[0] = (float)red;
   out[1] = (float)green;
   out[2] = (float)blue;
   out[3] = (float)alpha;
}

/**
 * Unpack a row of B10G10R10A2 USCALED pixels into RGBA bytes.
 *
 * Each 32-bit source word holds blue in bits 0-9, green in bits 10-19, red
 * in bits 20-29 and alpha in bits 30-31 as unsigned fields.  Every channel
 * is capped at 1 with MIN2() and then scaled by 0xff, and the four results
 * are written in r, g, b, a order.
 */
void
util_format_b10g10r10a2_uscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;

   for (unsigned remaining = width; remaining != 0; remaining--) {
      /* The field extraction is identical for either byte order. */
      const uint32_t texel = *(const uint32_t *)src;
      const uint32_t blue  = texel & 0x3ff;
      const uint32_t green = (texel >> 10) & 0x3ff;
      const uint32_t red   = (texel >> 20) & 0x3ff;
      const uint32_t alpha = texel >> 30;

      out[0] = (uint8_t)(((uint32_t)MIN2(red, 1)) * 0xff / 0x1);
      out[1] = (uint8_t)(((uint32_t)MIN2(green, 1)) * 0xff / 0x1);
      out[2] = (uint8_t)(((uint32_t)MIN2(blue, 1)) * 0xff / 0x1);
      out[3] = (uint8_t)(((uint32_t)MIN2(alpha, 1)) * 0xff / 0x1);

      src += 4;
      out += 4;
   }
}

/**
 * Pack RGBA byte rows into 32-bit B10G10R10A2 USCALED pixels.
 *
 * Every source pixel is four bytes (r, g, b, a).  Each byte is scaled with
 * the integer expression byte * 0x1 / 0xff, so only 0xff yields 1 and every
 * other input yields 0.  Blue is stored in bits 0-9, green in bits 10-19,
 * red in bits 20-29 and alpha in bits 30-31.
 *
 * dst_stride and src_stride are the distances between rows in bytes.
 */
void
util_format_b10g10r10a2_uscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         /* The assembled word is the same for either byte order. */
         const uint32_t blue  = ((uint32_t)(((uint32_t)in[2]) * 0x1 / 0xff)) & 0x3ff;
         const uint32_t green = ((uint32_t)(((uint32_t)in[1]) * 0x1 / 0xff)) & 0x3ff;
         const uint32_t red   = ((uint32_t)(((uint32_t)in[0]) * 0x1 / 0xff)) & 0x3ff;
         const uint32_t alpha = (uint32_t)(((uint32_t)in[3]) * 0x1 / 0xff);

         *(uint32_t *)out = blue | (green << 10) | (red << 20) | (alpha << 30);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of B10G10R10A2 SSCALED pixels into RGBA floats.
 *
 * Each 32-bit source word holds blue in bits 0-9, green in bits 10-19, red
 * in bits 20-29 and alpha in bits 30-31 as signed fields.  The sign-extended
 * integers are converted to float unchanged and written in r, g, b, a order.
 */
void
util_format_b10g10r10a2_sscaled_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *out = dst_row;

   for (unsigned remaining = width; remaining != 0; remaining--) {
      const uint32_t texel = *(const uint32_t *)src;

      /* Shift each field to the top of the word and back down so that its
       * sign bit is propagated; the same for either byte order. */
      const int32_t blue  = ((int32_t)(texel << 22)) >> 22;
      const int32_t green = ((int32_t)(texel << 12)) >> 22;
      const int32_t red   = ((int32_t)(texel << 2)) >> 22;
      const int32_t alpha = ((int32_t)(texel)) >> 30;

      out[0] = (float)red;
      out[1] = (float)green;
      out[2] = (float)blue;
      out[3] = (float)alpha;

      src += 4;
      out += 4;
   }
}

/**
 * Pack RGBA float rows into 32-bit B10G10R10A2 SSCALED pixels.
 *
 * Every source pixel is four consecutive floats (r, g, b, a).  Colour
 * channels are clamped to [-512, 511] and alpha to [-2, 1], truncated to
 * signed integers and stored as two's-complement fields: blue in bits 0-9,
 * green in bits 10-19, red in bits 20-29 and alpha in bits 30-31.
 *
 * dst_stride and src_stride are the distances between rows in bytes.
 */
void
util_format_b10g10r10a2_sscaled_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         /* Convert float -> int32_t -> uint32_t.  Converting a negative
          * float directly to an unsigned type is undefined behaviour in C,
          * whereas the signed-to-unsigned step is well defined and yields
          * the two's-complement bit pattern that is masked below.  The
          * assembled word is the same for either byte order. */
         const uint32_t blue  = ((uint32_t)(int32_t)CLAMP(in[2], -512.0f, 511.0f)) & 0x3ff;
         const uint32_t green = ((uint32_t)(int32_t)CLAMP(in[1], -512.0f, 511.0f)) & 0x3ff;
         const uint32_t red   = ((uint32_t)(int32_t)CLAMP(in[0], -512.0f, 511.0f)) & 0x3ff;
         /* Alpha is left unmasked; the shift by 30 keeps only two bits. */
         const uint32_t alpha = (uint32_t)(int32_t)CLAMP(in[3], -2.0f, 1.0f);

         *(uint32_t *)out = blue | (green << 10) | (red << 20) | (alpha << 30);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single B10G10R10A2 SSCALED pixel as four floats (r, g, b, a).
 *
 * The 32-bit word at src holds blue in bits 0-9, green in bits 10-19, red in
 * bits 20-29 and alpha in bits 30-31 as signed fields; each sign-extended
 * value is converted to float unchanged.  i and j are unused.
 */
void
util_format_b10g10r10a2_sscaled_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *out = in_dst;
   const uint32_t texel = *(const uint32_t *)src;

   /* Shift each field to the top of the word and back down so that its sign
    * bit is propagated; the same for either byte order. */
   const int32_t blue  = ((int32_t)(texel << 22)) >> 22;
   const int32_t green = ((int32_t)(texel << 12)) >> 22;
   const int32_t red   = ((int32_t)(texel << 2)) >> 22;
   const int32_t alpha = ((int32_t)(texel)) >> 30;

   out[0] = (float)red;
   out[1] = (float)green;
   out[2] = (float)blue;
   out[3] = (float)alpha;
}

/**
 * Unpack a row of B10G10R10A2 SSCALED pixels into RGBA bytes.
 *
 * Each 32-bit source word holds blue in bits 0-9, green in bits 10-19, red
 * in bits 20-29 and alpha in bits 30-31 as signed fields.  Colour channels
 * are clamped to [0, 1] with CLAMP(), alpha is raised to at least 0 with
 * MAX2(), and each result is scaled by 0xff before being written in
 * r, g, b, a order.
 */
void
util_format_b10g10r10a2_sscaled_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;

   for (unsigned remaining = width; remaining != 0; remaining--) {
      const uint32_t texel = *(const uint32_t *)src;

      /* Shift each field to the top of the word and back down so that its
       * sign bit is propagated; the same for either byte order. */
      const int32_t blue  = ((int32_t)(texel << 22)) >> 22;
      const int32_t green = ((int32_t)(texel << 12)) >> 22;
      const int32_t red   = ((int32_t)(texel << 2)) >> 22;
      const int32_t alpha = ((int32_t)(texel)) >> 30;

      out[0] = (uint8_t)(((uint32_t)CLAMP(red, 0, 1)) * 0xff / 0x1);
      out[1] = (uint8_t)(((uint32_t)CLAMP(green, 0, 1)) * 0xff / 0x1);
      out[2] = (uint8_t)(((uint32_t)CLAMP(blue, 0, 1)) * 0xff / 0x1);
      out[3] = (uint8_t)(((uint32_t)MAX2(alpha, 0)) * 0xff / 0x1);

      src += 4;
      out += 4;
   }
}

/**
 * Pack RGBA byte rows into 32-bit B10G10R10A2 SSCALED pixels.
 *
 * Every source pixel is four bytes (r, g, b, a).  Each byte is scaled with
 * the integer expression byte * 0x1 / 0xff, so only 0xff yields 1 and every
 * other input yields 0.  Blue is stored in bits 0-9, green in bits 10-19,
 * red in bits 20-29 and alpha in bits 30-31.
 *
 * dst_stride and src_stride are the distances between rows in bytes.
 */
void
util_format_b10g10r10a2_sscaled_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         /* The assembled word is the same for either byte order. */
         const uint32_t blue  = ((uint32_t)(((uint32_t)in[2]) * 0x1 / 0xff)) & 0x3ff;
         const uint32_t green = ((uint32_t)(((uint32_t)in[1]) * 0x1 / 0xff)) & 0x3ff;
         const uint32_t red   = ((uint32_t)(((uint32_t)in[0]) * 0x1 / 0xff)) & 0x3ff;
         const uint32_t alpha = (uint32_t)(((uint32_t)in[3]) * 0x1 / 0xff);

         *(uint32_t *)out = blue | (green << 10) | (red << 20) | (alpha << 30);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of B10G10R10A2 SNORM pixels into RGBA floats.
 *
 * Each 32-bit source word holds blue in bits 0-9, green in bits 10-19, red
 * in bits 20-29 and alpha in bits 30-31 as signed fields.  Colour channels
 * are scaled by 1/0x1ff and alpha by 1/0x1, and the results are written in
 * r, g, b, a order.
 *
 * The most negative code of each field (-512 for colour, -2 for alpha) would
 * scale to a value below -1.0; signed-normalized decoding defines the result
 * as max(c / (2^(b-1) - 1), -1.0), so every channel is clamped to -1.0.
 */
void
util_format_b10g10r10a2_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      const uint32_t value = *(const uint32_t *)src;

      /* Move each field to the top of the word and shift it back down so
       * its sign bit is propagated.  The extraction is identical for either
       * byte order. */
      const int32_t b = ((int32_t)(value << 22)) >> 22;
      const int32_t g = ((int32_t)(value << 12)) >> 22;
      const int32_t r = ((int32_t)(value << 2)) >> 22;
      const int32_t a = ((int32_t)(value)) >> 30;

      const float rf = (float)(r * (1.0f/0x1ff));
      const float gf = (float)(g * (1.0f/0x1ff));
      const float bf = (float)(b * (1.0f/0x1ff));
      const float af = (float)(a * (1.0f/0x1));

      dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
      dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
      dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
      dst[3] = af < -1.0f ? -1.0f : af; /* a */

      src += 4;
      dst += 4;
   }
}

/**
 * Pack RGBA float rows into 32-bit B10G10R10A2 SNORM pixels.
 *
 * Every source pixel is four consecutive floats (r, g, b, a).  Each channel
 * is clamped to [-1, 1], multiplied by 0x1ff (colour) or 0x1 (alpha) and
 * converted to an integer by util_iround().  Blue is stored in bits 0-9,
 * green in bits 10-19, red in bits 20-29 and alpha in bits 30-31.
 *
 * dst_stride and src_stride are the distances between rows in bytes.
 */
void
util_format_b10g10r10a2_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         /* The assembled word is the same for either byte order. */
         const uint32_t blue  = ((uint32_t)util_iround(CLAMP(in[2], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t green = ((uint32_t)util_iround(CLAMP(in[1], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t red   = ((uint32_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         /* Alpha is left unmasked; the shift by 30 keeps only two bits. */
         const uint32_t alpha = (uint32_t)util_iround(CLAMP(in[3], -1.0f, 1.0f) * 0x1);

         *(uint32_t *)out = blue | (green << 10) | (red << 20) | (alpha << 30);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single B10G10R10A2 SNORM pixel as four floats (r, g, b, a).
 *
 * The 32-bit word at src holds blue in bits 0-9, green in bits 10-19, red in
 * bits 20-29 and alpha in bits 30-31, each as a signed field.  Colour
 * channels are scaled by 1/0x1ff and alpha by 1/0x1.
 *
 * The most negative code of each field (-512 for colour, -2 for alpha) would
 * scale to a value below -1.0; signed-normalized decoding defines the result
 * as max(c / (2^(b-1) - 1), -1.0), so every channel is clamped to -1.0.
 *
 * i and j are unused.
 */
void
util_format_b10g10r10a2_snorm_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   float *dst = in_dst;
   const uint32_t value = *(const uint32_t *)src;

   /* Move each field to the top of the word and shift it back down so its
    * sign bit is propagated.  The extraction is identical for either byte
    * order. */
   const int32_t b = ((int32_t)(value << 22)) >> 22;
   const int32_t g = ((int32_t)(value << 12)) >> 22;
   const int32_t r = ((int32_t)(value << 2)) >> 22;
   const int32_t a = ((int32_t)(value)) >> 30;

   const float rf = (float)(r * (1.0f/0x1ff));
   const float gf = (float)(g * (1.0f/0x1ff));
   const float bf = (float)(b * (1.0f/0x1ff));
   const float af = (float)(a * (1.0f/0x1));

   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
   dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
   dst[3] = af < -1.0f ? -1.0f : af; /* a */
}

/**
 * Unpack a row of B10G10R10A2 SNORM pixels into RGBA bytes.
 *
 * Each 32-bit source word holds blue in bits 0-9, green in bits 10-19, red
 * in bits 20-29 and alpha in bits 30-31, all as signed fields.  Negative
 * channel values are raised to zero with MAX2() before being passed to
 * _mesa_snorm_to_unorm() (called with 10, 8 for colour and 2, 8 for alpha).
 * Four bytes (r, g, b, a) are written per pixel.
 */
void
util_format_b10g10r10a2_snorm_unpack_rgba_8unorm(uint8_t *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   uint8_t *out = dst_row;

   for (unsigned remaining = width; remaining != 0; remaining--) {
      const uint32_t texel = *(const uint32_t *)src;

      /* Shift each field to the top of the word and back down so that its
       * sign bit is propagated; the same for either byte order. */
      const int32_t blue  = ((int32_t)(texel << 22)) >> 22;
      const int32_t green = ((int32_t)(texel << 12)) >> 22;
      const int32_t red   = ((int32_t)(texel << 2)) >> 22;
      const int32_t alpha = ((int32_t)(texel)) >> 30;

      out[0] = _mesa_snorm_to_unorm(MAX2(red, 0), 10, 8);
      out[1] = _mesa_snorm_to_unorm(MAX2(green, 0), 10, 8);
      out[2] = _mesa_snorm_to_unorm(MAX2(blue, 0), 10, 8);
      out[3] = _mesa_snorm_to_unorm(MAX2(alpha, 0), 2, 8);

      src += 4;
      out += 4;
   }
}

/**
 * Pack RGBA byte rows into 32-bit B10G10R10A2 SNORM pixels.
 *
 * Every source pixel is four bytes (r, g, b, a).  Each one is converted by
 * _mesa_unorm_to_snorm() (called with 8, 10 for colour and 8, 2 for alpha).
 * Blue is stored in bits 0-9, green in bits 10-19, red in bits 20-29 and
 * alpha in bits 30-31.
 *
 * dst_stride and src_stride are the distances between rows in bytes.
 */
void
util_format_b10g10r10a2_snorm_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride, const uint8_t *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 4) {
         /* The assembled word is the same for either byte order. */
         const uint32_t blue  = (uint32_t)((_mesa_unorm_to_snorm(in[2], 8, 10)) & 0x3ff);
         const uint32_t green = (uint32_t)((_mesa_unorm_to_snorm(in[1], 8, 10)) & 0x3ff);
         const uint32_t red   = (uint32_t)((_mesa_unorm_to_snorm(in[0], 8, 10)) & 0x3ff);
         /* Alpha is left unmasked; the shift by 30 keeps only two bits. */
         const uint32_t alpha = (uint32_t)(_mesa_unorm_to_snorm(in[3], 8, 2));

         *(uint32_t *)out = blue | (green << 10) | (red << 20) | (alpha << 30);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of R8 UINT pixels into RGBA unsigned integers.
 *
 * Each source byte becomes the red channel; green and blue are written as 0
 * and alpha as 1.  Four unsigned values are written per pixel.
 */
void
util_format_r8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *out = dst_row;

   for (unsigned remaining = width; remaining != 0; remaining--) {
      const uint8_t red = *src;

      out[0] = (unsigned)red;
      out[1] = 0;
      out[2] = 0;
      out[3] = 1;

      src += 1;
      out += 4;
   }
}

/**
 * Pack RGBA unsigned-integer rows into R8 UINT pixels.
 *
 * Only the red component of each four-component source pixel is used; it is
 * capped at 255 with MIN2() and stored as a single byte.
 *
 * dst_stride and src_stride are the distances between rows in bytes.
 */
void
util_format_r8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const unsigned *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 1) {
         *out = (uint8_t)MIN2(in[0], 255);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single R8 UINT pixel as four unsigned integers (r, g, b, a).
 *
 * The byte at src becomes red; green and blue are written as 0 and alpha
 * as 1.  i and j are unused.
 */
void
util_format_r8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *out = in_dst;
   const uint8_t red = *src;

   out[0] = (unsigned)red;
   out[1] = 0;
   out[2] = 0;
   out[3] = 1;
}

/**
 * Pack RGBA signed-integer rows into R8 UINT pixels.
 *
 * Only the red component of each four-component source pixel is used; it is
 * clamped to [0, 255] with CLAMP() and stored as a single byte.
 *
 * dst_stride and src_stride are the distances between rows in bytes.
 */
void
util_format_r8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const int *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 1) {
         *out = (uint8_t)CLAMP(in[0], 0, 255);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of R8G8 UINT pixels into RGBA unsigned integers.
 *
 * Each pixel is read as one 16-bit word.  On little-endian builds red is the
 * low byte and green the high byte; on big-endian builds the roles are
 * swapped.  Blue is written as 0 and alpha as 1.
 */
void
util_format_r8g8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *out = dst_row;

   for (unsigned remaining = width; remaining != 0; remaining--) {
      const uint16_t texel = *(const uint16_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
      const uint16_t red   = texel >> 8;
      const uint16_t green = texel & 0xff;
#else
      const uint16_t red   = texel & 0xff;
      const uint16_t green = texel >> 8;
#endif

      out[0] = (unsigned)red;
      out[1] = (unsigned)green;
      out[2] = 0;
      out[3] = 1;

      src += 2;
      out += 4;
   }
}

/**
 * Pack RGBA unsigned-integer rows into R8G8 UINT pixels.
 *
 * The red and green components of each four-component source pixel are
 * capped at 255 with MIN2() and combined into one 16-bit word.  On
 * little-endian builds red is the low byte and green the high byte; on
 * big-endian builds the roles are swapped.
 *
 * dst_stride and src_stride are the distances between rows in bytes.
 */
void
util_format_r8g8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const unsigned *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 2) {
         const uint8_t red   = (uint8_t)MIN2(in[0], 255);
         const uint8_t green = (uint8_t)MIN2(in[1], 255);
#if UTIL_ARCH_BIG_ENDIAN
         *(uint16_t *)out = (uint16_t)(((uint32_t)red << 8) | green);
#else
         *(uint16_t *)out = (uint16_t)(red | ((uint32_t)green << 8));
#endif
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8 UINT pixel as four unsigned integers (r, g, b, a).
 *
 * The pixel is read as one 16-bit word.  On little-endian builds red is the
 * low byte and green the high byte; on big-endian builds the roles are
 * swapped.  Blue is written as 0 and alpha as 1.  i and j are unused.
 */
void
util_format_r8g8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *out = in_dst;
   const uint16_t texel = *(const uint16_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
   const uint16_t red   = texel >> 8;
   const uint16_t green = texel & 0xff;
#else
   const uint16_t red   = texel & 0xff;
   const uint16_t green = texel >> 8;
#endif

   out[0] = (unsigned)red;
   out[1] = (unsigned)green;
   out[2] = 0;
   out[3] = 1;
}

/**
 * Pack RGBA signed-integer rows into R8G8 UINT pixels.
 *
 * The red and green components of each four-component source pixel are
 * clamped to [0, 255] with CLAMP() and combined into one 16-bit word.  On
 * little-endian builds red is the low byte and green the high byte; on
 * big-endian builds the roles are swapped.
 *
 * dst_stride and src_stride are the distances between rows in bytes.
 */
void
util_format_r8g8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const int *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; col++, in += 4, out += 2) {
         const uint8_t red   = (uint8_t)CLAMP(in[0], 0, 255);
         const uint8_t green = (uint8_t)CLAMP(in[1], 0, 255);
#if UTIL_ARCH_BIG_ENDIAN
         *(uint16_t *)out = (uint16_t)(((uint32_t)red << 8) | green);
#else
         *(uint16_t *)out = (uint16_t)(red | ((uint32_t)green << 8));
#endif
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R8G8B8 UINT pixel: three 8-bit channels in the
 * order red, green, blue.  The member order is the same for either byte
 * order, so no UTIL_ARCH_BIG_ENDIAN split is needed.
 */
struct util_format_r8g8b8_uint {
   uint8_t r; /* red */
   uint8_t g; /* green */
   uint8_t b; /* blue */
};

void
util_format_r8g8b8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_r8g8b8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uint pixel = {0};
         pixel.r = (uint8_t)MIN2(src[0], 255);
         pixel.g = (uint8_t)MIN2(src[1], 255);
         pixel.b = (uint8_t)MIN2(src[2], 255);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_uint pixel = {0};
         pixel.r = (uint8_t)MIN2(src[0], 255);
         pixel.g = (uint8_t)MIN2(src[1], 255);
         pixel.b = (uint8_t)MIN2(src[2], 255);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8 UINT pixel as four unsigned integers (r, g, b, a).
 *
 * The pixel is copied from src into a struct util_format_r8g8b8_uint and its
 * r, g and b members are widened to unsigned; alpha is written as 1.
 * i and j are unused.
 */
void
util_format_r8g8b8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *out = in_dst;

   /* The same code serves either byte order. */
   struct util_format_r8g8b8_uint texel;
   memcpy(&texel, src, sizeof texel);

   out[0] = (unsigned)texel.r;
   out[1] = (unsigned)texel.g;
   out[2] = (unsigned)texel.b;
   out[3] = 1;
}

void
util_format_r8g8b8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uint pixel = {0};
         pixel.r = (uint8_t)CLAMP(src[0], 0, 255);
         pixel.g = (uint8_t)CLAMP(src[1], 0, 255);
         pixel.b = (uint8_t)CLAMP(src[2], 0, 255);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_uint pixel = {0};
         pixel.r = (uint8_t)CLAMP(src[0], 0, 255);
         pixel.g = (uint8_t)CLAMP(src[1], 0, 255);
         pixel.b = (uint8_t)CLAMP(src[2], 0, 255);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R8G8B8A8 UINT pixels into RGBA unsigned integers.
 *
 * Each pixel is read as one 32-bit word.  On little-endian builds red is
 * bits 0-7, green bits 8-15, blue bits 16-23 and alpha bits 24-31; on
 * big-endian builds the order is reversed (red in bits 24-31 down to alpha
 * in bits 0-7).
 */
void
util_format_r8g8b8a8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *out = dst_row;

   for (unsigned remaining = width; remaining != 0; remaining--) {
      const uint32_t texel = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
      const uint32_t red   = texel >> 24;
      const uint32_t green = (texel >> 16) & 0xff;
      const uint32_t blue  = (texel >> 8) & 0xff;
      const uint32_t alpha = texel & 0xff;
#else
      const uint32_t red   = texel & 0xff;
      const uint32_t green = (texel >> 8) & 0xff;
      const uint32_t blue  = (texel >> 16) & 0xff;
      const uint32_t alpha = texel >> 24;
#endif

      out[0] = (unsigned)red;
      out[1] = (unsigned)green;
      out[2] = (unsigned)blue;
      out[3] = (unsigned)alpha;

      src += 4;
      out += 4;
   }
}

/**
 * Pack unsigned RGBA integers into R8G8B8A8_UINT.
 *
 * Each of the four source channels is limited with MIN2 to at most 255 and
 * stored as one byte, in r, g, b, a order.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (four unsigned values per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 *
 * Bytes are written individually.  The resulting memory image matches a
 * native-endian uint32_t store of the shifted channels on both big- and
 * little-endian hosts, but dst does not have to be 4-byte aligned.
 */
void
util_format_r8g8b8a8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)MIN2(src[0], 255); /* r */
         dst[1] = (uint8_t)MIN2(src[1], 255); /* g */
         dst[2] = (uint8_t)MIN2(src[2], 255); /* b */
         dst[3] = (uint8_t)MIN2(src[3], 255); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8B8A8_UINT pixel as unsigned RGBA.
 *
 * @param in_dst  destination; receives four unsigned values (r, g, b, a)
 * @param src     the four packed bytes of the pixel
 * @param i       unused
 * @param j       unused
 *
 * Channels are read per byte, which equals the native-endian uint32_t
 * load-and-shift on both endiannesses and works for unaligned src.
 */
void
util_format_r8g8b8a8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   dst[0] = (unsigned)src[0]; /* r */
   dst[1] = (unsigned)src[1]; /* g */
   dst[2] = (unsigned)src[2]; /* b */
   dst[3] = (unsigned)src[3]; /* a */
}

/**
 * Pack signed RGBA integers into R8G8B8A8_UINT.
 *
 * Each of the four source channels is limited with CLAMP to [0, 255] and
 * stored as one byte, in r, g, b, a order.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (four ints per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 *
 * Bytes are written individually.  The memory image matches a native-endian
 * uint32_t store of the shifted channels on both endiannesses, but dst does
 * not have to be 4-byte aligned.
 */
void
util_format_r8g8b8a8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)CLAMP(src[0], 0, 255); /* r */
         dst[1] = (uint8_t)CLAMP(src[1], 0, 255); /* g */
         dst[2] = (uint8_t)CLAMP(src[2], 0, 255); /* b */
         dst[3] = (uint8_t)CLAMP(src[3], 0, 255); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R8_SINT pixels into signed RGBA.
 *
 * Each source byte is reinterpreted as int8_t and widened into the red
 * channel; green and blue are set to 0 and alpha to 1.
 *
 * @param dst_row  destination; receives four ints per pixel
 * @param src      packed source, one byte per pixel
 * @param width    number of pixels to convert
 */
void
util_format_r8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *rgba = dst_row;
   for (unsigned px = 0; px < width; ++px, rgba += 4) {
      const int8_t red = (int8_t)src[px];
      rgba[0] = (int)red; /* r */
      rgba[1] = 0;        /* g */
      rgba[2] = 0;        /* b */
      rgba[3] = 1;        /* a */
   }
}

/**
 * Pack signed RGBA integers into R8_SINT.
 *
 * Only the red channel of each four-int source pixel is used; it is limited
 * with CLAMP to [-128, 127] and stored as a single byte.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_r8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const int *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         const int8_t red = (int8_t)CLAMP(in[0], -128, 127);
         *out = (uint8_t)red;
         in += 4;
         out += 1;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8_SINT pixel as signed RGBA.
 *
 * The source byte is reinterpreted as int8_t for red; green and blue are 0
 * and alpha is 1.  The i and j arguments are unused.
 */
void
util_format_r8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *rgba = in_dst;
   const int8_t red = (int8_t)src[0];

   rgba[0] = (int)red; /* r */
   rgba[1] = 0;        /* g */
   rgba[2] = 0;        /* b */
   rgba[3] = 1;        /* a */
}

/**
 * Pack unsigned RGBA integers into R8_SINT.
 *
 * Only the red channel of each four-value source pixel is used; it is
 * limited with MIN2 to at most 127 and stored as a single byte.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_r8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const unsigned *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         const int8_t red = (int8_t)MIN2(in[0], 127);
         *out = (uint8_t)red;
         in += 4;
         out += 1;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R8G8_SINT pixels into signed RGBA.
 *
 * @param dst_row  destination; receives four ints per pixel
 *                 (r, g, then b = 0 and a = 1)
 * @param src      packed source, two bytes per pixel
 * @param width    number of pixels to convert
 *
 * Each channel byte is reinterpreted as int8_t.  Byte 0 is r and byte 1 is g
 * on both big- and little-endian hosts, so this matches the native-endian
 * uint16_t load-and-shift while not requiring src to be 2-byte aligned.
 */
void
util_format_r8g8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      dst[0] = (int)(int8_t)src[0]; /* r */
      dst[1] = (int)(int8_t)src[1]; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack signed RGBA integers into R8G8_SINT.
 *
 * The r and g channels of each four-int source pixel are limited with CLAMP
 * to [-128, 127] and stored as one byte each; b and a are ignored.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 *
 * Bytes are written individually.  The memory image matches a native-endian
 * uint16_t store on both endiannesses, but dst need not be 2-byte aligned.
 */
void
util_format_r8g8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)(int8_t)CLAMP(src[0], -128, 127); /* r */
         dst[1] = (uint8_t)(int8_t)CLAMP(src[1], -128, 127); /* g */
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8_SINT pixel as signed RGBA.
 *
 * @param in_dst  destination; receives r, g, then b = 0 and a = 1
 * @param src     the two packed bytes of the pixel
 * @param i       unused
 * @param j       unused
 *
 * Channels are read per byte (byte 0 is r, byte 1 is g on both
 * endiannesses), so src need not be 2-byte aligned.
 */
void
util_format_r8g8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   dst[0] = (int)(int8_t)src[0]; /* r */
   dst[1] = (int)(int8_t)src[1]; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Pack unsigned RGBA integers into R8G8_SINT.
 *
 * The r and g channels of each four-value source pixel are limited with MIN2
 * to at most 127 and stored as one byte each; b and a are ignored.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 *
 * Bytes are written individually.  The memory image matches a native-endian
 * uint16_t store on both endiannesses, but dst need not be 2-byte aligned.
 */
void
util_format_r8g8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)(int8_t)MIN2(src[0], 127); /* r */
         dst[1] = (uint8_t)(int8_t)MIN2(src[1], 127); /* g */
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * In-memory image of one R8G8B8_SINT pixel: three signed bytes in r, g, b
 * order.  The member order is the same for big- and little-endian builds.
 */
struct util_format_r8g8b8_sint {
   int8_t r;
   int8_t g;
   int8_t b;
};

void
util_format_r8g8b8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_r8g8b8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sint pixel = {0};
         pixel.r = (int8_t)CLAMP(src[0], -128, 127);
         pixel.g = (int8_t)CLAMP(src[1], -128, 127);
         pixel.b = (int8_t)CLAMP(src[2], -128, 127);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_sint pixel = {0};
         pixel.r = (int8_t)CLAMP(src[0], -128, 127);
         pixel.g = (int8_t)CLAMP(src[1], -128, 127);
         pixel.b = (int8_t)CLAMP(src[2], -128, 127);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8B8_SINT pixel as signed RGBA.
 *
 * The pixel is copied into a struct util_format_r8g8b8_sint; r, g and b are
 * widened to int and alpha is set to 1.  The i and j arguments are unused.
 */
void
util_format_r8g8b8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *rgba = in_dst;
   /* Big- and little-endian builds use identical code here. */
   struct util_format_r8g8b8_sint texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (int)texel.r; /* r */
   rgba[1] = (int)texel.g; /* g */
   rgba[2] = (int)texel.b; /* b */
   rgba[3] = 1;            /* a */
}

void
util_format_r8g8b8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sint pixel = {0};
         pixel.r = (int8_t)MIN2(src[0], 127);
         pixel.g = (int8_t)MIN2(src[1], 127);
         pixel.b = (int8_t)MIN2(src[2], 127);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_sint pixel = {0};
         pixel.r = (int8_t)MIN2(src[0], 127);
         pixel.g = (int8_t)MIN2(src[1], 127);
         pixel.b = (int8_t)MIN2(src[2], 127);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R8G8B8A8_SINT pixels into signed RGBA.
 *
 * @param dst_row  destination; receives four ints (r, g, b, a) per pixel
 * @param src      packed source, four bytes per pixel
 * @param width    number of pixels to convert
 *
 * Each channel byte is reinterpreted as int8_t.  Byte 0 is r and byte 3 is a
 * on both big- and little-endian hosts, so this matches the native-endian
 * uint32_t load with shift-based sign extension while not requiring src to
 * be 4-byte aligned.
 */
void
util_format_r8g8b8a8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      dst[0] = (int)(int8_t)src[0]; /* r */
      dst[1] = (int)(int8_t)src[1]; /* g */
      dst[2] = (int)(int8_t)src[2]; /* b */
      dst[3] = (int)(int8_t)src[3]; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack signed RGBA integers into R8G8B8A8_SINT.
 *
 * Each of the four source channels is limited with CLAMP to [-128, 127] and
 * stored as one byte, in r, g, b, a order.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (four ints per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 *
 * Bytes are written individually.  The memory image matches a native-endian
 * uint32_t store of the shifted channels on both endiannesses, but dst does
 * not have to be 4-byte aligned.
 */
void
util_format_r8g8b8a8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)(int8_t)CLAMP(src[0], -128, 127); /* r */
         dst[1] = (uint8_t)(int8_t)CLAMP(src[1], -128, 127); /* g */
         dst[2] = (uint8_t)(int8_t)CLAMP(src[2], -128, 127); /* b */
         dst[3] = (uint8_t)(int8_t)CLAMP(src[3], -128, 127); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8B8A8_SINT pixel as signed RGBA.
 *
 * @param in_dst  destination; receives four ints (r, g, b, a)
 * @param src     the four packed bytes of the pixel
 * @param i       unused
 * @param j       unused
 *
 * Channels are read per byte and reinterpreted as int8_t, which equals the
 * native-endian uint32_t load-and-shift on both endiannesses and works for
 * unaligned src.
 */
void
util_format_r8g8b8a8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   dst[0] = (int)(int8_t)src[0]; /* r */
   dst[1] = (int)(int8_t)src[1]; /* g */
   dst[2] = (int)(int8_t)src[2]; /* b */
   dst[3] = (int)(int8_t)src[3]; /* a */
}

/**
 * Pack unsigned RGBA integers into R8G8B8A8_SINT.
 *
 * Each of the four source channels is limited with MIN2 to at most 127 and
 * stored as one byte, in r, g, b, a order.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (four unsigned values per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 *
 * Bytes are written individually.  The memory image matches a native-endian
 * uint32_t store of the shifted channels on both endiannesses, but dst does
 * not have to be 4-byte aligned.
 */
void
util_format_r8g8b8a8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         dst[0] = (uint8_t)(int8_t)MIN2(src[0], 127); /* r */
         dst[1] = (uint8_t)(int8_t)MIN2(src[1], 127); /* g */
         dst[2] = (uint8_t)(int8_t)MIN2(src[2], 127); /* b */
         dst[3] = (uint8_t)(int8_t)MIN2(src[3], 127); /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R16_UINT pixels into unsigned RGBA.
 *
 * @param dst_row  destination; receives four unsigned values per pixel
 *                 (r, then g = 0, b = 0, a = 1)
 * @param src      packed source, one native-endian uint16_t per pixel
 * @param width    number of pixels to convert
 *
 * The 16-bit value is loaded with memcpy rather than through a casted
 * pointer, so src does not have to be 2-byte aligned.
 */
void
util_format_r16_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint16_t r;
      memcpy(&r, src, sizeof r);
      dst[0] = (unsigned)r; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack unsigned RGBA integers into R16_UINT.
 *
 * Only the red channel of each four-value source pixel is used; it is
 * limited with MIN2 to at most 65535 and stored as a native-endian uint16_t.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 *
 * The value is stored with memcpy rather than through a casted pointer, so
 * dst does not have to be 2-byte aligned.
 */
void
util_format_r16_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t r = (uint16_t)MIN2(src[0], 65535);
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R16_UINT pixel as unsigned RGBA.
 *
 * @param in_dst  destination; receives r, then g = 0, b = 0, a = 1
 * @param src     the pixel, one native-endian uint16_t
 * @param i       unused
 * @param j       unused
 *
 * The value is loaded with memcpy rather than through a casted pointer, so
 * src does not have to be 2-byte aligned.
 */
void
util_format_r16_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint16_t r;
   memcpy(&r, src, sizeof r);
   dst[0] = (unsigned)r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Pack signed RGBA integers into R16_UINT.
 *
 * Only the red channel of each four-int source pixel is used; it is limited
 * with CLAMP to [0, 65535] and stored as a native-endian uint16_t.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 *
 * The value is stored with memcpy rather than through a casted pointer, so
 * dst does not have to be 2-byte aligned.
 */
void
util_format_r16_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t r = (uint16_t)CLAMP(src[0], 0, 65535);
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R16G16_UINT pixels into unsigned RGBA.
 *
 * @param dst_row  destination; receives four unsigned values per pixel
 *                 (r, g, then b = 0 and a = 1)
 * @param src      packed source, two native-endian uint16_t per pixel
 * @param width    number of pixels to convert
 *
 * The two channels are copied out with memcpy.  The first 16-bit element is
 * r and the second is g on both big- and little-endian hosts, so this
 * matches the native-endian uint32_t load-and-shift while not requiring src
 * to be 4-byte aligned.
 */
void
util_format_r16g16_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint16_t rg[2];
      memcpy(rg, src, sizeof rg);
      dst[0] = (unsigned)rg[0]; /* r */
      dst[1] = (unsigned)rg[1]; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack unsigned RGBA integers into R16G16_UINT.
 *
 * The r and g channels of each four-value source pixel are limited with MIN2
 * to at most 65535 and stored as two native-endian uint16_t; b and a are
 * ignored.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 *
 * The channels are stored with memcpy.  The memory image matches a
 * native-endian uint32_t store of the shifted channels on both
 * endiannesses, but dst does not have to be 4-byte aligned.
 */
void
util_format_r16g16_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         uint16_t rg[2];
         rg[0] = (uint16_t)MIN2(src[0], 65535); /* r */
         rg[1] = (uint16_t)MIN2(src[1], 65535); /* g */
         memcpy(dst, rg, sizeof rg);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R16G16_UINT pixel as unsigned RGBA.
 *
 * @param in_dst  destination; receives r, g, then b = 0 and a = 1
 * @param src     the pixel, two native-endian uint16_t
 * @param i       unused
 * @param j       unused
 *
 * The channels are copied out with memcpy (first element r, second g on
 * both endiannesses), so src does not have to be 4-byte aligned.
 */
void
util_format_r16g16_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint16_t rg[2];
   memcpy(rg, src, sizeof rg);
   dst[0] = (unsigned)rg[0]; /* r */
   dst[1] = (unsigned)rg[1]; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Pack signed RGBA integers into R16G16_UINT.
 *
 * The r and g channels of each four-int source pixel are limited with CLAMP
 * to [0, 65535] and stored as two native-endian uint16_t; b and a are
 * ignored.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 *
 * The channels are stored with memcpy.  The memory image matches a
 * native-endian uint32_t store of the shifted channels on both
 * endiannesses, but dst does not have to be 4-byte aligned.
 */
void
util_format_r16g16_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         uint16_t rg[2];
         rg[0] = (uint16_t)CLAMP(src[0], 0, 65535); /* r */
         rg[1] = (uint16_t)CLAMP(src[1], 0, 65535); /* g */
         memcpy(dst, rg, sizeof rg);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * In-memory image of one R16G16B16_UINT pixel: three uint16_t channels in
 * r, g, b order.  The member order is the same for big- and little-endian
 * builds.
 */
struct util_format_r16g16b16_uint {
   uint16_t r;
   uint16_t g;
   uint16_t b;
};

void
util_format_r16g16b16_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 6;
      dst += 4;
   }
}

void
util_format_r16g16b16_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uint pixel = {0};
         pixel.r = (uint16_t)MIN2(src[0], 65535);
         pixel.g = (uint16_t)MIN2(src[1], 65535);
         pixel.b = (uint16_t)MIN2(src[2], 65535);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_uint pixel = {0};
         pixel.r = (uint16_t)MIN2(src[0], 65535);
         pixel.g = (uint16_t)MIN2(src[1], 65535);
         pixel.b = (uint16_t)MIN2(src[2], 65535);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R16G16B16_UINT pixel as unsigned RGBA.
 *
 * The pixel is copied into a struct util_format_r16g16b16_uint; r, g and b
 * are widened to unsigned and alpha is set to 1.  The i and j arguments are
 * unused.
 */
void
util_format_r16g16b16_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   /* Big- and little-endian builds use identical code here. */
   struct util_format_r16g16b16_uint texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (unsigned)texel.r; /* r */
   rgba[1] = (unsigned)texel.g; /* g */
   rgba[2] = (unsigned)texel.b; /* b */
   rgba[3] = 1;                 /* a */
}

void
util_format_r16g16b16_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uint pixel = {0};
         pixel.r = (uint16_t)CLAMP(src[0], 0, 65535);
         pixel.g = (uint16_t)CLAMP(src[1], 0, 65535);
         pixel.b = (uint16_t)CLAMP(src[2], 0, 65535);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_uint pixel = {0};
         pixel.r = (uint16_t)CLAMP(src[0], 0, 65535);
         pixel.g = (uint16_t)CLAMP(src[1], 0, 65535);
         pixel.b = (uint16_t)CLAMP(src[2], 0, 65535);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * In-memory image of one R16G16B16A16_UINT pixel: four uint16_t channels in
 * r, g, b, a order.  The member order is the same for big- and
 * little-endian builds.
 */
struct util_format_r16g16b16a16_uint {
   uint16_t r;
   uint16_t g;
   uint16_t b;
   uint16_t a;
};

void
util_format_r16g16b16a16_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = (unsigned)pixel.a; /* a */
#else
         struct util_format_r16g16b16a16_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = (unsigned)pixel.a; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r16g16b16a16_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uint pixel = {0};
         pixel.r = (uint16_t)MIN2(src[0], 65535);
         pixel.g = (uint16_t)MIN2(src[1], 65535);
         pixel.b = (uint16_t)MIN2(src[2], 65535);
         pixel.a = (uint16_t)MIN2(src[3], 65535);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_uint pixel = {0};
         pixel.r = (uint16_t)MIN2(src[0], 65535);
         pixel.g = (uint16_t)MIN2(src[1], 65535);
         pixel.b = (uint16_t)MIN2(src[2], 65535);
         pixel.a = (uint16_t)MIN2(src[3], 65535);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R16G16B16A16_UINT pixel as unsigned RGBA.
 *
 * The pixel is copied into a struct util_format_r16g16b16a16_uint and its
 * four members are widened to unsigned.  The i and j arguments are unused.
 */
void
util_format_r16g16b16a16_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   /* Big- and little-endian builds use identical code here. */
   struct util_format_r16g16b16a16_uint texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (unsigned)texel.r; /* r */
   rgba[1] = (unsigned)texel.g; /* g */
   rgba[2] = (unsigned)texel.b; /* b */
   rgba[3] = (unsigned)texel.a; /* a */
}

void
util_format_r16g16b16a16_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uint pixel = {0};
         pixel.r = (uint16_t)CLAMP(src[0], 0, 65535);
         pixel.g = (uint16_t)CLAMP(src[1], 0, 65535);
         pixel.b = (uint16_t)CLAMP(src[2], 0, 65535);
         pixel.a = (uint16_t)CLAMP(src[3], 0, 65535);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_uint pixel = {0};
         pixel.r = (uint16_t)CLAMP(src[0], 0, 65535);
         pixel.g = (uint16_t)CLAMP(src[1], 0, 65535);
         pixel.b = (uint16_t)CLAMP(src[2], 0, 65535);
         pixel.a = (uint16_t)CLAMP(src[3], 0, 65535);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R16_SINT pixels into signed RGBA.
 *
 * @param dst_row  destination; receives four ints per pixel
 *                 (r, then g = 0, b = 0, a = 1)
 * @param src      packed source, one native-endian int16_t per pixel
 * @param width    number of pixels to convert
 *
 * The 16-bit value is loaded with memcpy rather than through a casted
 * pointer, so src does not have to be 2-byte aligned.
 */
void
util_format_r16_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      int16_t r;
      memcpy(&r, src, sizeof r);
      dst[0] = (int)r; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack signed RGBA integers into R16_SINT.
 *
 * Only the red channel of each four-int source pixel is used; it is limited
 * with CLAMP to [-32768, 32767] and stored as a native-endian int16_t.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 *
 * The value is stored with memcpy rather than through a casted pointer, so
 * dst does not have to be 2-byte aligned.
 */
void
util_format_r16_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const int16_t r = (int16_t)CLAMP(src[0], -32768, 32767);
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R16_SINT pixel as signed RGBA.
 *
 * @param in_dst  destination; receives r, then g = 0, b = 0, a = 1
 * @param src     the pixel, one native-endian int16_t
 * @param i       unused
 * @param j       unused
 *
 * The value is loaded with memcpy rather than through a casted pointer, so
 * src does not have to be 2-byte aligned.
 */
void
util_format_r16_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   int16_t r;
   memcpy(&r, src, sizeof r);
   dst[0] = (int)r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Pack unsigned RGBA integers into R16_SINT.
 *
 * Only the red channel of each four-value source pixel is used; it is
 * limited with MIN2 to at most 32767 and stored as a native-endian int16_t.
 *
 * @param dst_row     first destination row
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 *
 * The value is stored with memcpy rather than through a casted pointer, so
 * dst does not have to be 2-byte aligned.
 */
void
util_format_r16_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const int16_t r = (int16_t)MIN2(src[0], 32767);
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of R16G16_SINT pixels into signed RGBA.
 *
 * @param dst_row  destination; receives four ints per pixel
 *                 (r, g, then b = 0 and a = 1)
 * @param src      packed source, two native-endian int16_t per pixel
 * @param width    number of pixels to convert
 *
 * The two channels are copied out with memcpy.  The first 16-bit element is
 * r and the second is g on both big- and little-endian hosts, so this
 * matches the native-endian uint32_t load with shift-based sign extension
 * while not requiring src to be 4-byte aligned.
 */
void
util_format_r16g16_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      int16_t rg[2];
      memcpy(rg, src, sizeof rg);
      dst[0] = (int)rg[0]; /* r */
      dst[1] = (int)rg[1]; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of signed-integer RGBA pixels into R16G16_SINT.
 *
 * Red and green of each four-int source pixel are clamped with CLAMP to
 * [-32768, 32767] and combined into one 32-bit word. On little-endian
 * builds red is the low half; with UTIL_ARCH_BIG_ENDIAN set it is the high
 * half. dst_stride and src_stride are both expressed in bytes.
 */
void
util_format_r16g16_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint32_t)((int16_t)CLAMP(src[0], -32768, 32767)) << 16) ;
         value |= (uint32_t)(((int16_t)CLAMP(src[1], -32768, 32767)) & 0xffff) ;
#else
         value |= (uint32_t)(((int16_t)CLAMP(src[0], -32768, 32767)) & 0xffff) ;
         value |= (uint32_t)((uint32_t)((int16_t)CLAMP(src[1], -32768, 32767)) << 16) ;
#endif
         /* dst may not be 4-byte aligned; memcpy avoids a misaligned store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; convert to a count of ints. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16_SINT texel as signed-integer RGBA.
 *
 * Red and green are sign-extended from the two 16-bit halves of the 32-bit
 * word at src; blue is 0 and alpha is 1. The i and j arguments are not used.
 */
void
util_format_r16g16_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   uint32_t value;

   /* src is a byte pointer and may not be 4-byte aligned, so load through
    * memcpy rather than dereferencing a cast pointer.
    */
   memcpy(&value, src, sizeof value);

#if UTIL_ARCH_BIG_ENDIAN
   int32_t r = ((int32_t)(value) ) >> 16;
   int32_t g = ((int32_t)(value << 16) ) >> 16;
#else
   int32_t r = ((int32_t)(value << 16) ) >> 16;
   int32_t g = ((int32_t)(value) ) >> 16;
#endif
   dst[0] = (int)r; /* r */
   dst[1] = (int)g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Pack rows of unsigned-integer RGBA pixels into R16G16_SINT.
 *
 * Red and green of each four-value source pixel are limited with MIN2 to at
 * most 32767 and combined into one 32-bit word. On little-endian builds red
 * is the low half; with UTIL_ARCH_BIG_ENDIAN set it is the high half.
 * dst_stride and src_stride are both expressed in bytes.
 */
void
util_format_r16g16_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint32_t)((int16_t)MIN2(src[0], 32767)) << 16) ;
         value |= (uint32_t)(((int16_t)MIN2(src[1], 32767)) & 0xffff) ;
#else
         value |= (uint32_t)(((int16_t)MIN2(src[0], 32767)) & 0xffff) ;
         value |= (uint32_t)((uint32_t)((int16_t)MIN2(src[1], 32767)) << 16) ;
#endif
         /* dst may not be 4-byte aligned; memcpy avoids a misaligned store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; convert to a count of unsigned ints. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One R16G16B16_SINT pixel: three signed 16-bit channels stored in r, g, b
 * order. The member order is the same whether or not UTIL_ARCH_BIG_ENDIAN
 * is set.
 */
struct util_format_r16g16b16_sint {
   int16_t r; /* red */
   int16_t g; /* green */
   int16_t b; /* blue */
};

void
util_format_r16g16b16_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 6;
      dst += 4;
   }
}

void
util_format_r16g16b16_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sint pixel = {0};
         pixel.r = (int16_t)CLAMP(src[0], -32768, 32767);
         pixel.g = (int16_t)CLAMP(src[1], -32768, 32767);
         pixel.b = (int16_t)CLAMP(src[2], -32768, 32767);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_sint pixel = {0};
         pixel.r = (int16_t)CLAMP(src[0], -32768, 32767);
         pixel.g = (int16_t)CLAMP(src[1], -32768, 32767);
         pixel.b = (int16_t)CLAMP(src[2], -32768, 32767);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16_SINT texel as signed-integer RGBA.
 *
 * The three 16-bit channels at src are widened to int; alpha is written as
 * 1. The i and j arguments are not used.
 */
void
util_format_r16g16b16_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r16g16b16_sint texel;
   int *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = (int)texel.r;
   rgba[1] = (int)texel.g;
   rgba[2] = (int)texel.b;
   rgba[3] = 1;
}

void
util_format_r16g16b16_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sint pixel = {0};
         pixel.r = (int16_t)MIN2(src[0], 32767);
         pixel.g = (int16_t)MIN2(src[1], 32767);
         pixel.b = (int16_t)MIN2(src[2], 32767);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_sint pixel = {0};
         pixel.r = (int16_t)MIN2(src[0], 32767);
         pixel.g = (int16_t)MIN2(src[1], 32767);
         pixel.b = (int16_t)MIN2(src[2], 32767);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One R16G16B16A16_SINT pixel: four signed 16-bit channels stored in
 * r, g, b, a order. The member order is the same whether or not
 * UTIL_ARCH_BIG_ENDIAN is set.
 */
struct util_format_r16g16b16a16_sint {
   int16_t r; /* red */
   int16_t g; /* green */
   int16_t b; /* blue */
   int16_t a; /* alpha */
};

void
util_format_r16g16b16a16_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = (int)pixel.a; /* a */
#else
         struct util_format_r16g16b16a16_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = (int)pixel.a; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r16g16b16a16_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sint pixel = {0};
         pixel.r = (int16_t)CLAMP(src[0], -32768, 32767);
         pixel.g = (int16_t)CLAMP(src[1], -32768, 32767);
         pixel.b = (int16_t)CLAMP(src[2], -32768, 32767);
         pixel.a = (int16_t)CLAMP(src[3], -32768, 32767);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_sint pixel = {0};
         pixel.r = (int16_t)CLAMP(src[0], -32768, 32767);
         pixel.g = (int16_t)CLAMP(src[1], -32768, 32767);
         pixel.b = (int16_t)CLAMP(src[2], -32768, 32767);
         pixel.a = (int16_t)CLAMP(src[3], -32768, 32767);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16A16_SINT texel as signed-integer RGBA.
 *
 * The four 16-bit channels at src are widened to int. The i and j arguments
 * are not used.
 */
void
util_format_r16g16b16a16_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r16g16b16a16_sint texel;
   int *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = (int)texel.r;
   rgba[1] = (int)texel.g;
   rgba[2] = (int)texel.b;
   rgba[3] = (int)texel.a;
}

void
util_format_r16g16b16a16_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sint pixel = {0};
         pixel.r = (int16_t)MIN2(src[0], 32767);
         pixel.g = (int16_t)MIN2(src[1], 32767);
         pixel.b = (int16_t)MIN2(src[2], 32767);
         pixel.a = (int16_t)MIN2(src[3], 32767);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_sint pixel = {0};
         pixel.r = (int16_t)MIN2(src[0], 32767);
         pixel.g = (int16_t)MIN2(src[1], 32767);
         pixel.b = (int16_t)MIN2(src[2], 32767);
         pixel.a = (int16_t)MIN2(src[3], 32767);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R32_UINT pixels into unsigned-integer RGBA.
 *
 * Reads `width` 4-byte pixels from src and writes four unsigned ints per
 * pixel to dst_row: the stored value as red, 0 for green and blue, and 1
 * for alpha.
 */
void
util_format_r32_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t r;

      /* src is a byte pointer and may not be 4-byte aligned, so load through
       * memcpy rather than dereferencing a cast pointer.
       */
      memcpy(&r, src, sizeof r);

      dst[0] = r; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of unsigned-integer RGBA pixels into R32_UINT.
 *
 * Only the first (red) value of each four-value source pixel is stored, as
 * one 32-bit word. dst_stride and src_stride are both expressed in bytes.
 */
void
util_format_r32_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
         value |= src[0];
         /* dst may not be 4-byte aligned; memcpy avoids a misaligned store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; convert to a count of unsigned ints. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32_UINT texel as unsigned-integer RGBA.
 *
 * The 32-bit value at src becomes red; green and blue are 0 and alpha is 1.
 * The i and j arguments are not used.
 */
void
util_format_r32_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint32_t r;

   /* src is a byte pointer and may not be 4-byte aligned, so load through
    * memcpy rather than dereferencing a cast pointer.
    */
   memcpy(&r, src, sizeof r);

   dst[0] = r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Pack rows of signed-integer RGBA pixels into R32_UINT.
 *
 * Only the first (red) value of each four-int source pixel is used; it is
 * passed through MAX2(..., 0) so negative inputs are stored as 0.
 * dst_stride and src_stride are both expressed in bytes.
 */
void
util_format_r32_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
         value |= (uint32_t)MAX2(src[0], 0);
         /* dst may not be 4-byte aligned; memcpy avoids a misaligned store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; convert to a count of ints. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One R32G32_UINT pixel: two unsigned 32-bit channels stored in r, g order.
 * The member order is the same whether or not UTIL_ARCH_BIG_ENDIAN is set.
 */
struct util_format_r32g32_uint {
   uint32_t r; /* red */
   uint32_t g; /* green */
};

void
util_format_r32g32_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uint pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_uint pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32_UINT texel as unsigned-integer RGBA.
 *
 * Red and green come from the two 32-bit channels at src; blue is 0 and
 * alpha is 1. The i and j arguments are not used.
 */
void
util_format_r32g32_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r32g32_uint texel;
   unsigned *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = texel.r;
   rgba[1] = texel.g;
   rgba[2] = 0;
   rgba[3] = 1;
}

void
util_format_r32g32_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uint pixel = {0};
         pixel.r = (uint32_t)MAX2(src[0], 0);
         pixel.g = (uint32_t)MAX2(src[1], 0);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_uint pixel = {0};
         pixel.r = (uint32_t)MAX2(src[0], 0);
         pixel.g = (uint32_t)MAX2(src[1], 0);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One R32G32B32_UINT pixel: three unsigned 32-bit channels stored in
 * r, g, b order. The member order is the same whether or not
 * UTIL_ARCH_BIG_ENDIAN is set.
 */
struct util_format_r32g32b32_uint {
   uint32_t r; /* red */
   uint32_t g; /* green */
   uint32_t b; /* blue */
};

void
util_format_r32g32b32_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uint pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_uint pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32_UINT texel as unsigned-integer RGBA.
 *
 * Red, green and blue come from the three 32-bit channels at src; alpha is
 * written as 1. The i and j arguments are not used.
 */
void
util_format_r32g32b32_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r32g32b32_uint texel;
   unsigned *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = texel.r;
   rgba[1] = texel.g;
   rgba[2] = texel.b;
   rgba[3] = 1;
}

void
util_format_r32g32b32_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uint pixel = {0};
         pixel.r = (uint32_t)MAX2(src[0], 0);
         pixel.g = (uint32_t)MAX2(src[1], 0);
         pixel.b = (uint32_t)MAX2(src[2], 0);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_uint pixel = {0};
         pixel.r = (uint32_t)MAX2(src[0], 0);
         pixel.g = (uint32_t)MAX2(src[1], 0);
         pixel.b = (uint32_t)MAX2(src[2], 0);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One R32G32B32A32_UINT pixel: four unsigned 32-bit channels stored in
 * r, g, b, a order. The member order is the same whether or not
 * UTIL_ARCH_BIG_ENDIAN is set.
 */
struct util_format_r32g32b32a32_uint {
   uint32_t r; /* red */
   uint32_t g; /* green */
   uint32_t b; /* blue */
   uint32_t a; /* alpha */
};

void
util_format_r32g32b32a32_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = pixel.a; /* a */
#else
         struct util_format_r32g32b32a32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = pixel.a; /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uint pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_uint pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32A32_UINT texel as unsigned-integer RGBA.
 *
 * The four 32-bit channels at src are copied to the output in r, g, b, a
 * order. The i and j arguments are not used.
 */
void
util_format_r32g32b32a32_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r32g32b32a32_uint texel;
   unsigned *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = texel.r;
   rgba[1] = texel.g;
   rgba[2] = texel.b;
   rgba[3] = texel.a;
}

void
util_format_r32g32b32a32_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uint pixel = {0};
         pixel.r = (uint32_t)MAX2(src[0], 0);
         pixel.g = (uint32_t)MAX2(src[1], 0);
         pixel.b = (uint32_t)MAX2(src[2], 0);
         pixel.a = (uint32_t)MAX2(src[3], 0);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_uint pixel = {0};
         pixel.r = (uint32_t)MAX2(src[0], 0);
         pixel.g = (uint32_t)MAX2(src[1], 0);
         pixel.b = (uint32_t)MAX2(src[2], 0);
         pixel.a = (uint32_t)MAX2(src[3], 0);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of R32_SINT pixels into signed-integer RGBA.
 *
 * Reads `width` 4-byte pixels from src and writes four ints per pixel to
 * dst_row: the stored value as red, 0 for green and blue, and 1 for alpha.
 */
void
util_format_r32_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;

      /* src is a byte pointer and may not be 4-byte aligned, so load through
       * memcpy rather than dereferencing a cast pointer.
       */
      memcpy(&value, src, sizeof value);

      int32_t r = (int32_t)(value) ;
      dst[0] = r; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack rows of signed-integer RGBA pixels into R32_SINT.
 *
 * Only the first (red) value of each four-int source pixel is stored, as
 * one 32-bit word. dst_stride and src_stride are both expressed in bytes.
 */
void
util_format_r32_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
         value |= (uint32_t)(src[0]) ;
         /* dst may not be 4-byte aligned; memcpy avoids a misaligned store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; convert to a count of ints. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32_SINT texel as signed-integer RGBA.
 *
 * The 32-bit value at src is reinterpreted as signed and becomes red; green
 * and blue are 0 and alpha is 1. The i and j arguments are not used.
 */
void
util_format_r32_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   uint32_t value;

   /* src is a byte pointer and may not be 4-byte aligned, so load through
    * memcpy rather than dereferencing a cast pointer.
    */
   memcpy(&value, src, sizeof value);

   int32_t r = (int32_t)(value) ;
   dst[0] = r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Pack rows of unsigned-integer RGBA pixels into R32_SINT.
 *
 * Only the first (red) value of each four-value source pixel is used; it is
 * limited with MIN2 to at most 2147483647 and stored as one 32-bit word.
 * dst_stride and src_stride are both expressed in bytes.
 */
void
util_format_r32_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
         value |= (uint32_t)((int32_t)MIN2(src[0], 2147483647)) ;
         /* dst may not be 4-byte aligned; memcpy avoids a misaligned store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; convert to a count of unsigned ints. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One R32G32_SINT pixel: two signed 32-bit channels stored in r, g order.
 * The member order is the same whether or not UTIL_ARCH_BIG_ENDIAN is set.
 */
struct util_format_r32g32_sint {
   int32_t r; /* red */
   int32_t g; /* green */
};

void
util_format_r32g32_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_r32g32_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sint pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_sint pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32_SINT texel as signed-integer RGBA.
 *
 * Red and green come from the two 32-bit channels at src; blue is 0 and
 * alpha is 1. The i and j arguments are not used.
 */
void
util_format_r32g32_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r32g32_sint texel;
   int *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = texel.r;
   rgba[1] = texel.g;
   rgba[2] = 0;
   rgba[3] = 1;
}

void
util_format_r32g32_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sint pixel = {0};
         pixel.r = (int32_t)MIN2(src[0], 2147483647);
         pixel.g = (int32_t)MIN2(src[1], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_sint pixel = {0};
         pixel.r = (int32_t)MIN2(src[0], 2147483647);
         pixel.g = (int32_t)MIN2(src[1], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One R32G32B32_SINT pixel: three signed 32-bit channels stored in r, g, b
 * order. The member order is the same whether or not UTIL_ARCH_BIG_ENDIAN
 * is set.
 */
struct util_format_r32g32b32_sint {
   int32_t r; /* red */
   int32_t g; /* green */
   int32_t b; /* blue */
};

void
util_format_r32g32b32_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 12;
      dst += 4;
   }
}

void
util_format_r32g32b32_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sint pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_sint pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32_SINT texel as signed-integer RGBA.
 *
 * Red, green and blue come from the three 32-bit channels at src; alpha is
 * written as 1. The i and j arguments are not used.
 */
void
util_format_r32g32b32_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r32g32b32_sint texel;
   int *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);

   rgba[0] = texel.r;
   rgba[1] = texel.g;
   rgba[2] = texel.b;
   rgba[3] = 1;
}

void
util_format_r32g32b32_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sint pixel = {0};
         pixel.r = (int32_t)MIN2(src[0], 2147483647);
         pixel.g = (int32_t)MIN2(src[1], 2147483647);
         pixel.b = (int32_t)MIN2(src[2], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_sint pixel = {0};
         pixel.r = (int32_t)MIN2(src[0], 2147483647);
         pixel.g = (int32_t)MIN2(src[1], 2147483647);
         pixel.b = (int32_t)MIN2(src[2], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One R32G32B32A32_SINT pixel: four signed 32-bit channels stored in
 * r, g, b, a order. The member order is the same whether or not
 * UTIL_ARCH_BIG_ENDIAN is set.
 */
struct util_format_r32g32b32a32_sint {
   int32_t r; /* red */
   int32_t g; /* green */
   int32_t b; /* blue */
   int32_t a; /* alpha */
};

void
util_format_r32g32b32a32_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = pixel.a; /* a */
#else
         struct util_format_r32g32b32a32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = pixel.a; /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r32g32b32a32_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sint pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_sint pixel = {0};
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32A32_SINT pixel from src as four signed ints.
 *
 * The i and j arguments are not used.
 */
void
util_format_r32g32b32a32_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The big- and little-endian code paths are identical here. */
   struct util_format_r32g32b32a32_sint texel;
   int *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = texel.r;
   rgba[1] = texel.g;
   rgba[2] = texel.b;
   rgba[3] = texel.a;
}

void
util_format_r32g32b32a32_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sint pixel = {0};
         pixel.r = (int32_t)MIN2(src[0], 2147483647);
         pixel.g = (int32_t)MIN2(src[1], 2147483647);
         pixel.b = (int32_t)MIN2(src[2], 2147483647);
         pixel.a = (int32_t)MIN2(src[3], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_sint pixel = {0};
         pixel.r = (int32_t)MIN2(src[0], 2147483647);
         pixel.g = (int32_t)MIN2(src[1], 2147483647);
         pixel.b = (int32_t)MIN2(src[2], 2147483647);
         pixel.a = (int32_t)MIN2(src[3], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/** In-memory layout of one R64_UINT pixel: a single unsigned 64-bit red channel. */
struct util_format_r64_uint {
   uint64_t r;
};

void
util_format_r64_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_r64_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MIN2(pixel.r, 4294967295); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
      src += 8;
      dst += 4;
   }
}

void
util_format_r64_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r64_uint pixel = {0};
         pixel.r = (uint64_t)src[0];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R64_UINT pixel from src as unsigned RGBA.
 *
 * Red is limited to 4294967295; green and blue are 0, alpha is 1.  The i
 * and j arguments are not used.
 */
void
util_format_r64_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r64_uint texel;
   unsigned *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (unsigned)MIN2(texel.r, 4294967295);
   rgba[1] = 0;
   rgba[2] = 0;
   rgba[3] = 1;
}

void
util_format_r64_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r64_uint pixel = {0};
         pixel.r = (uint64_t)MAX2(src[0], 0);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R64G64_UINT pixel: two unsigned 64-bit channels
 * in r, g order.  The member order is the same on big- and little-endian
 * builds, so no endian switch is needed.
 */
struct util_format_r64g64_uint {
   uint64_t r, g;
};

void
util_format_r64g64_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MIN2(pixel.r, 4294967295); /* r */
         dst[1] = (unsigned)MIN2(pixel.g, 4294967295); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r64g64_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MIN2(pixel.r, 4294967295); /* r */
         dst[1] = (unsigned)MIN2(pixel.g, 4294967295); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
      src += 16;
      dst += 4;
   }
}

void
util_format_r64g64_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64_uint pixel = {0};
         pixel.r = (uint64_t)src[0];
         pixel.g = (uint64_t)src[1];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64_uint pixel = {0};
         pixel.r = (uint64_t)src[0];
         pixel.g = (uint64_t)src[1];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R64G64_UINT pixel from src as unsigned RGBA.
 *
 * Red and green are limited to 4294967295; blue is 0, alpha is 1.  The i
 * and j arguments are not used.
 */
void
util_format_r64g64_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The big- and little-endian code paths are identical here. */
   struct util_format_r64g64_uint texel;
   unsigned *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (unsigned)MIN2(texel.r, 4294967295);
   rgba[1] = (unsigned)MIN2(texel.g, 4294967295);
   rgba[2] = 0;
   rgba[3] = 1;
}

void
util_format_r64g64_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64_uint pixel = {0};
         pixel.r = (uint64_t)MAX2(src[0], 0);
         pixel.g = (uint64_t)MAX2(src[1], 0);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64_uint pixel = {0};
         pixel.r = (uint64_t)MAX2(src[0], 0);
         pixel.g = (uint64_t)MAX2(src[1], 0);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R64G64B64_UINT pixel: three unsigned 64-bit
 * channels in r, g, b order.  The member order is the same on big- and
 * little-endian builds, so no endian switch is needed.
 */
struct util_format_r64g64b64_uint {
   uint64_t r, g, b;
};

void
util_format_r64g64b64_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MIN2(pixel.r, 4294967295); /* r */
         dst[1] = (unsigned)MIN2(pixel.g, 4294967295); /* g */
         dst[2] = (unsigned)MIN2(pixel.b, 4294967295); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r64g64b64_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MIN2(pixel.r, 4294967295); /* r */
         dst[1] = (unsigned)MIN2(pixel.g, 4294967295); /* g */
         dst[2] = (unsigned)MIN2(pixel.b, 4294967295); /* b */
         dst[3] = 1; /* a */
#endif
      src += 24;
      dst += 4;
   }
}

void
util_format_r64g64b64_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64_uint pixel = {0};
         pixel.r = (uint64_t)src[0];
         pixel.g = (uint64_t)src[1];
         pixel.b = (uint64_t)src[2];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64b64_uint pixel = {0};
         pixel.r = (uint64_t)src[0];
         pixel.g = (uint64_t)src[1];
         pixel.b = (uint64_t)src[2];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 24;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R64G64B64_UINT pixel from src as unsigned RGBA.
 *
 * Red, green and blue are limited to 4294967295; alpha is 1.  The i and j
 * arguments are not used.
 */
void
util_format_r64g64b64_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The big- and little-endian code paths are identical here. */
   struct util_format_r64g64b64_uint texel;
   unsigned *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (unsigned)MIN2(texel.r, 4294967295);
   rgba[1] = (unsigned)MIN2(texel.g, 4294967295);
   rgba[2] = (unsigned)MIN2(texel.b, 4294967295);
   rgba[3] = 1;
}

void
util_format_r64g64b64_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64_uint pixel = {0};
         pixel.r = (uint64_t)MAX2(src[0], 0);
         pixel.g = (uint64_t)MAX2(src[1], 0);
         pixel.b = (uint64_t)MAX2(src[2], 0);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64b64_uint pixel = {0};
         pixel.r = (uint64_t)MAX2(src[0], 0);
         pixel.g = (uint64_t)MAX2(src[1], 0);
         pixel.b = (uint64_t)MAX2(src[2], 0);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 24;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R64G64B64A64_UINT pixel: four unsigned 64-bit
 * channels in r, g, b, a order.  The member order is the same on big- and
 * little-endian builds, so no endian switch is needed.
 */
struct util_format_r64g64b64a64_uint {
   uint64_t r, g, b, a;
};

void
util_format_r64g64b64a64_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64a64_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MIN2(pixel.r, 4294967295); /* r */
         dst[1] = (unsigned)MIN2(pixel.g, 4294967295); /* g */
         dst[2] = (unsigned)MIN2(pixel.b, 4294967295); /* b */
         dst[3] = (unsigned)MIN2(pixel.a, 4294967295); /* a */
#else
         struct util_format_r64g64b64a64_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MIN2(pixel.r, 4294967295); /* r */
         dst[1] = (unsigned)MIN2(pixel.g, 4294967295); /* g */
         dst[2] = (unsigned)MIN2(pixel.b, 4294967295); /* b */
         dst[3] = (unsigned)MIN2(pixel.a, 4294967295); /* a */
#endif
      src += 32;
      dst += 4;
   }
}

void
util_format_r64g64b64a64_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64a64_uint pixel = {0};
         pixel.r = (uint64_t)src[0];
         pixel.g = (uint64_t)src[1];
         pixel.b = (uint64_t)src[2];
         pixel.a = (uint64_t)src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64b64a64_uint pixel = {0};
         pixel.r = (uint64_t)src[0];
         pixel.g = (uint64_t)src[1];
         pixel.b = (uint64_t)src[2];
         pixel.a = (uint64_t)src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 32;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R64G64B64A64_UINT pixel from src as unsigned RGBA.
 *
 * Every channel is limited to 4294967295.  The i and j arguments are not
 * used.
 */
void
util_format_r64g64b64a64_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The big- and little-endian code paths are identical here. */
   struct util_format_r64g64b64a64_uint texel;
   unsigned *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (unsigned)MIN2(texel.r, 4294967295);
   rgba[1] = (unsigned)MIN2(texel.g, 4294967295);
   rgba[2] = (unsigned)MIN2(texel.b, 4294967295);
   rgba[3] = (unsigned)MIN2(texel.a, 4294967295);
}

void
util_format_r64g64b64a64_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64a64_uint pixel = {0};
         pixel.r = (uint64_t)MAX2(src[0], 0);
         pixel.g = (uint64_t)MAX2(src[1], 0);
         pixel.b = (uint64_t)MAX2(src[2], 0);
         pixel.a = (uint64_t)MAX2(src[3], 0);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64b64a64_uint pixel = {0};
         pixel.r = (uint64_t)MAX2(src[0], 0);
         pixel.g = (uint64_t)MAX2(src[1], 0);
         pixel.b = (uint64_t)MAX2(src[2], 0);
         pixel.a = (uint64_t)MAX2(src[3], 0);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 32;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/** In-memory layout of one R64_SINT pixel: a single signed 64-bit red channel. */
struct util_format_r64_sint {
   int64_t r;
};

void
util_format_r64_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
         struct util_format_r64_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)CLAMP(pixel.r, -2147483648, 2147483647); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
      src += 8;
      dst += 4;
   }
}

void
util_format_r64_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r64_sint pixel = {0};
         pixel.r = (int64_t)src[0];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R64_SINT pixel from src as signed RGBA ints.
 *
 * Red is clamped to [-2147483648, 2147483647]; green and blue are 0, alpha
 * is 1.  The i and j arguments are not used.
 */
void
util_format_r64_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r64_sint texel;
   int *rgba = in_dst;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = (int)CLAMP(texel.r, -2147483648, 2147483647);
   rgba[1] = 0;
   rgba[2] = 0;
   rgba[3] = 1;
}

void
util_format_r64_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r64_sint pixel = {0};
         pixel.r = (int64_t)src[0];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of A8_UINT pixels into unsigned RGBA.
 *
 * Each source byte becomes the alpha value; red, green and blue are 0.
 * Reads one byte and writes four unsigned values per pixel.
 */
void
util_format_a8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *rgba = dst_row;
   for (unsigned n = width; n > 0; --n, ++src, rgba += 4) {
      const uint8_t alpha = src[0];
      rgba[0] = 0;
      rgba[1] = 0;
      rgba[2] = 0;
      rgba[3] = (unsigned)alpha;
   }
}

/**
 * Pack rows of unsigned RGBA tuples into A8_UINT pixels.
 *
 * Only alpha (element 3 of each 4-wide source pixel) is stored, limited to
 * 255.  One byte is written per pixel.  Both strides are in bytes.
 */
void
util_format_a8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const unsigned *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, in += 4, ++out) {
         out[0] = (uint8_t)MIN2(in[3], 255);
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single A8_UINT pixel from src as unsigned RGBA.
 *
 * The source byte becomes alpha; red, green and blue are 0.  The i and j
 * arguments are not used.
 */
void
util_format_a8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t alpha = src[0];
   unsigned *rgba = in_dst;

   rgba[0] = 0;
   rgba[1] = 0;
   rgba[2] = 0;
   rgba[3] = (unsigned)alpha;
}

/**
 * Pack rows of signed RGBA ints into A8_UINT pixels.
 *
 * Only alpha (element 3 of each 4-wide source pixel) is stored, clamped to
 * [0, 255].  One byte is written per pixel.  Both strides are in bytes.
 */
void
util_format_a8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const int *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, in += 4, ++out) {
         out[0] = (uint8_t)CLAMP(in[3], 0, 255);
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of I8_UINT pixels into unsigned RGBA.
 *
 * Each source byte is replicated into all four output channels.  Reads one
 * byte and writes four unsigned values per pixel.
 */
void
util_format_i8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *rgba = dst_row;
   for (unsigned n = width; n > 0; --n, ++src, rgba += 4) {
      const unsigned intensity = (unsigned)src[0];
      rgba[0] = intensity;
      rgba[1] = intensity;
      rgba[2] = intensity;
      rgba[3] = intensity;
   }
}

/**
 * Pack rows of unsigned RGBA tuples into I8_UINT pixels.
 *
 * Only element 0 (red) of each 4-wide source pixel is stored, limited to
 * 255.  One byte is written per pixel.  Both strides are in bytes.
 */
void
util_format_i8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const unsigned *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, in += 4, ++out) {
         out[0] = (uint8_t)MIN2(in[0], 255);
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single I8_UINT pixel from src as unsigned RGBA.
 *
 * The source byte is replicated into all four channels.  The i and j
 * arguments are not used.
 */
void
util_format_i8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const unsigned intensity = (unsigned)src[0];
   unsigned *rgba = in_dst;

   rgba[0] = intensity;
   rgba[1] = intensity;
   rgba[2] = intensity;
   rgba[3] = intensity;
}

/**
 * Pack rows of signed RGBA ints into I8_UINT pixels.
 *
 * Only element 0 (red) of each 4-wide source pixel is stored, clamped to
 * [0, 255].  One byte is written per pixel.  Both strides are in bytes.
 */
void
util_format_i8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const int *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, in += 4, ++out) {
         out[0] = (uint8_t)CLAMP(in[0], 0, 255);
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of L8_UINT pixels into unsigned RGBA.
 *
 * Each source byte is replicated into red, green and blue; alpha is 1.
 * Reads one byte and writes four unsigned values per pixel.
 */
void
util_format_l8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *rgba = dst_row;
   for (unsigned n = width; n > 0; --n, ++src, rgba += 4) {
      const unsigned luminance = (unsigned)src[0];
      rgba[0] = luminance;
      rgba[1] = luminance;
      rgba[2] = luminance;
      rgba[3] = 1;
   }
}

/**
 * Pack rows of unsigned RGBA tuples into L8_UINT pixels.
 *
 * Only element 0 (red) of each 4-wide source pixel is stored, limited to
 * 255.  One byte is written per pixel.  Both strides are in bytes.
 */
void
util_format_l8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const unsigned *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, in += 4, ++out) {
         out[0] = (uint8_t)MIN2(in[0], 255);
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single L8_UINT pixel from src as unsigned RGBA.
 *
 * The source byte is replicated into red, green and blue; alpha is 1.  The
 * i and j arguments are not used.
 */
void
util_format_l8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const unsigned luminance = (unsigned)src[0];
   unsigned *rgba = in_dst;

   rgba[0] = luminance;
   rgba[1] = luminance;
   rgba[2] = luminance;
   rgba[3] = 1;
}

/**
 * Pack rows of signed RGBA ints into L8_UINT pixels.
 *
 * Only element 0 (red) of each 4-wide source pixel is stored, clamped to
 * [0, 255].  One byte is written per pixel.  Both strides are in bytes.
 */
void
util_format_l8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const int *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, in += 4, ++out) {
         out[0] = (uint8_t)CLAMP(in[0], 0, 255);
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of L8A8_UINT pixels into unsigned RGBA.
 *
 * Each pixel is one 16-bit word.  On little-endian builds luminance is the
 * low byte and alpha the high byte; on big-endian builds the two are
 * swapped.  Luminance is replicated into red, green and blue.  Reads two
 * bytes and writes four unsigned values per pixel.
 */
void
util_format_l8a8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is a byte pointer with no alignment guarantee, so the 16-bit
       * word is loaded with memcpy instead of through a casted pointer. */
      uint16_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      const uint16_t rgb = value >> 8;
      const uint16_t a = (value) & 0xff;
#else
      const uint16_t rgb = (value) & 0xff;
      const uint16_t a = value >> 8;
#endif
      dst[0] = (unsigned)rgb; /* r */
      dst[1] = (unsigned)rgb; /* g */
      dst[2] = (unsigned)rgb; /* b */
      dst[3] = (unsigned)a; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack rows of unsigned RGBA tuples into L8A8_UINT pixels.
 *
 * Luminance is taken from element 0 (red) and alpha from element 3 of each
 * 4-wide source pixel, each limited to 255.  On little-endian builds
 * luminance goes in the low byte of the 16-bit word and alpha in the high
 * byte; on big-endian builds the two are swapped.  Two bytes are written
 * per pixel.  Both strides are in bytes.
 */
void
util_format_l8a8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint8_t lum = (uint8_t)MIN2(src[0], 255);
         const uint8_t alpha = (uint8_t)MIN2(src[3], 255);
#if UTIL_ARCH_BIG_ENDIAN
         const uint16_t value = (uint16_t)(((uint32_t)lum << 8) | alpha);
#else
         const uint16_t value = (uint16_t)(lum | ((uint32_t)alpha << 8));
#endif
         /* dst is a byte pointer with no alignment guarantee, so the
          * 16-bit word is stored with memcpy instead of a casted pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single L8A8_UINT pixel from src as unsigned RGBA.
 *
 * The pixel is one 16-bit word.  On little-endian builds luminance is the
 * low byte and alpha the high byte; on big-endian builds the two are
 * swapped.  Luminance is replicated into red, green and blue.  The i and j
 * arguments are not used.
 */
void
util_format_l8a8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   /* src is a byte pointer with no alignment guarantee, so the 16-bit word
    * is loaded with memcpy instead of through a casted pointer. */
   uint16_t value;
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   const uint16_t rgb = value >> 8;
   const uint16_t a = (value) & 0xff;
#else
   const uint16_t rgb = (value) & 0xff;
   const uint16_t a = value >> 8;
#endif
   dst[0] = (unsigned)rgb; /* r */
   dst[1] = (unsigned)rgb; /* g */
   dst[2] = (unsigned)rgb; /* b */
   dst[3] = (unsigned)a; /* a */
}

/**
 * Pack rows of signed RGBA ints into L8A8_UINT pixels.
 *
 * Luminance is taken from element 0 (red) and alpha from element 3 of each
 * 4-wide source pixel, each clamped to [0, 255].  On little-endian builds
 * luminance goes in the low byte of the 16-bit word and alpha in the high
 * byte; on big-endian builds the two are swapped.  Two bytes are written
 * per pixel.  Both strides are in bytes.
 */
void
util_format_l8a8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint8_t lum = (uint8_t)CLAMP(src[0], 0, 255);
         const uint8_t alpha = (uint8_t)CLAMP(src[3], 0, 255);
#if UTIL_ARCH_BIG_ENDIAN
         const uint16_t value = (uint16_t)(((uint32_t)lum << 8) | alpha);
#else
         const uint16_t value = (uint16_t)(lum | ((uint32_t)alpha << 8));
#endif
         /* dst is a byte pointer with no alignment guarantee, so the
          * 16-bit word is stored with memcpy instead of a casted pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of A8_SINT pixels into signed RGBA ints.
 *
 * Each source byte, reinterpreted as int8_t, becomes alpha; red, green and
 * blue are 0.  Reads one byte and writes four ints per pixel.
 */
void
util_format_a8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *rgba = dst_row;
   for (unsigned n = width; n > 0; --n, ++src, rgba += 4) {
      const int8_t alpha = (int8_t)src[0];
      rgba[0] = 0;
      rgba[1] = 0;
      rgba[2] = 0;
      rgba[3] = (int)alpha;
   }
}

/**
 * Pack rows of signed RGBA ints into A8_SINT pixels.
 *
 * Only alpha (element 3 of each 4-wide source pixel) is stored, clamped to
 * [-128, 127].  One byte is written per pixel.  Both strides are in bytes.
 */
void
util_format_a8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const int *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, in += 4, ++out) {
         out[0] = (uint8_t)((int8_t)CLAMP(in[3], -128, 127));
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single A8_SINT pixel from src as signed RGBA ints.
 *
 * The source byte, reinterpreted as int8_t, becomes alpha; red, green and
 * blue are 0.  The i and j arguments are not used.
 */
void
util_format_a8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int8_t alpha = (int8_t)src[0];
   int *rgba = in_dst;

   rgba[0] = 0;
   rgba[1] = 0;
   rgba[2] = 0;
   rgba[3] = (int)alpha;
}

/**
 * Pack rows of unsigned RGBA tuples into A8_SINT pixels.
 *
 * Only alpha (element 3 of each 4-wide source pixel) is stored, limited to
 * 127.  One byte is written per pixel.  Both strides are in bytes.
 */
void
util_format_a8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const unsigned *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, in += 4, ++out) {
         out[0] = (uint8_t)((int8_t)MIN2(in[3], 127));
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of I8_SINT pixels into signed RGBA ints.
 *
 * Each source byte, reinterpreted as int8_t, is replicated into all four
 * output channels.  Reads one byte and writes four ints per pixel.
 */
void
util_format_i8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *rgba = dst_row;
   for (unsigned n = width; n > 0; --n, ++src, rgba += 4) {
      const int intensity = (int)(int8_t)src[0];
      rgba[0] = intensity;
      rgba[1] = intensity;
      rgba[2] = intensity;
      rgba[3] = intensity;
   }
}

/**
 * Pack rows of signed RGBA ints into I8_SINT pixels.
 *
 * Only element 0 (red) of each 4-wide source pixel is stored, clamped to
 * [-128, 127].  One byte is written per pixel.  Both strides are in bytes.
 */
void
util_format_i8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const int *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, in += 4, ++out) {
         out[0] = (uint8_t)((int8_t)CLAMP(in[0], -128, 127));
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single I8_SINT pixel from src as signed RGBA ints.
 *
 * The source byte, reinterpreted as int8_t, is replicated into all four
 * channels.  The i and j arguments are not used.
 */
void
util_format_i8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int intensity = (int)(int8_t)src[0];
   int *rgba = in_dst;

   rgba[0] = intensity;
   rgba[1] = intensity;
   rgba[2] = intensity;
   rgba[3] = intensity;
}

/**
 * Pack rows of unsigned RGBA tuples into I8_SINT pixels.
 *
 * Only element 0 (red) of each 4-wide source pixel is stored, limited to
 * 127.  One byte is written per pixel.  Both strides are in bytes.
 */
void
util_format_i8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const unsigned *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, in += 4, ++out) {
         out[0] = (uint8_t)((int8_t)MIN2(in[0], 127));
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of L8_SINT pixels into signed RGBA ints.
 *
 * Each source byte, reinterpreted as int8_t, is replicated into red, green
 * and blue; alpha is 1.  Reads one byte and writes four ints per pixel.
 */
void
util_format_l8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *rgba = dst_row;
   for (unsigned n = width; n > 0; --n, ++src, rgba += 4) {
      const int luminance = (int)(int8_t)src[0];
      rgba[0] = luminance;
      rgba[1] = luminance;
      rgba[2] = luminance;
      rgba[3] = 1;
   }
}

/**
 * Pack rows of signed RGBA ints into L8_SINT pixels.
 *
 * Only element 0 (red) of each 4-wide source pixel is stored, clamped to
 * [-128, 127].  One byte is written per pixel.  Both strides are in bytes.
 */
void
util_format_l8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const int *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, in += 4, ++out) {
         out[0] = (uint8_t)((int8_t)CLAMP(in[0], -128, 127));
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single L8_SINT pixel from src as signed RGBA ints.
 *
 * The source byte, reinterpreted as int8_t, is replicated into red, green
 * and blue; alpha is 1.  The i and j arguments are not used.
 */
void
util_format_l8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int luminance = (int)(int8_t)src[0];
   int *rgba = in_dst;

   rgba[0] = luminance;
   rgba[1] = luminance;
   rgba[2] = luminance;
   rgba[3] = 1;
}

/**
 * Pack rows of unsigned RGBA tuples into L8_SINT pixels.
 *
 * Only element 0 (red) of each 4-wide source pixel is stored, limited to
 * 127.  One byte is written per pixel.  Both strides are in bytes.
 */
void
util_format_l8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const unsigned *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col, in += 4, ++out) {
         out[0] = (uint8_t)((int8_t)MIN2(in[0], 127));
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a row of L8A8_SINT pixels into signed RGBA ints.
 *
 * Each pixel is one 16-bit word holding two sign-extended 8-bit channels.
 * On little-endian builds luminance is the low byte and alpha the high
 * byte; on big-endian builds the two are swapped.  Luminance is replicated
 * into red, green and blue.  Reads two bytes and writes four ints per pixel.
 */
void
util_format_l8a8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is a byte pointer with no alignment guarantee, so the 16-bit
       * word is loaded with memcpy instead of through a casted pointer. */
      uint16_t value;
      memcpy(&value, src, sizeof value);
      /* Shifting the byte to the top of an int16_t and back down
       * sign-extends it. */
#if UTIL_ARCH_BIG_ENDIAN
      const int16_t rgb = ((int16_t)(value) ) >> 8;
      const int16_t a = ((int16_t)(value << 8) ) >> 8;
#else
      const int16_t rgb = ((int16_t)(value << 8) ) >> 8;
      const int16_t a = ((int16_t)(value) ) >> 8;
#endif
      dst[0] = (int)rgb; /* r */
      dst[1] = (int)rgb; /* g */
      dst[2] = (int)rgb; /* b */
      dst[3] = (int)a; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack signed-integer RGBA rows into l8a8_sint (16 bits per pixel).
 *
 * Components 0 (luminance) and 3 (alpha) of each input pixel are saturated
 * to [-128, 127] and combined into one 16-bit word; UTIL_ARCH_BIG_ENDIAN
 * selects which half each channel occupies.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l8a8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const int8_t l = (int8_t)CLAMP(src[0], -128, 127);
         const int8_t a = (int8_t)CLAMP(src[3], -128, 127);
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint16_t)((uint32_t)l << 8);
         value |= (uint16_t)(a & 0xff);
#else
         value |= (uint16_t)(l & 0xff);
         value |= (uint16_t)((uint32_t)a << 8);
#endif
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l8a8_sint pixel as signed-integer RGBA.
 *
 * The 16-bit word at src is loaded in host byte order and split into two
 * sign-extended 8-bit channels (layout chosen by UTIL_ARCH_BIG_ENDIAN).
 * Luminance is replicated to r, g and b.  The i and j arguments are not used.
 *
 * @param in_dst  output, four ints (r, g, b, a)
 * @param src     address of the packed pixel (2 bytes)
 */
void
util_format_l8a8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   uint16_t value;
   /* Copy instead of cast-dereferencing: src is only byte-typed. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   const int16_t rgb = ((int16_t)(value)) >> 8;
   const int16_t a = ((int16_t)(value << 8)) >> 8;
#else
   const int16_t rgb = ((int16_t)(value << 8)) >> 8;
   const int16_t a = ((int16_t)(value)) >> 8;
#endif
   dst[0] = (int)rgb; /* r */
   dst[1] = (int)rgb; /* g */
   dst[2] = (int)rgb; /* b */
   dst[3] = (int)a; /* a */
}

/**
 * Pack unsigned-integer RGBA rows into l8a8_sint (16 bits per pixel).
 *
 * Components 0 (luminance) and 3 (alpha) of each input pixel are limited to
 * at most 127 and combined into one 16-bit word; UTIL_ARCH_BIG_ENDIAN
 * selects which half each channel occupies.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four unsigned ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l8a8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const int8_t l = (int8_t)MIN2(src[0], 127);
         const int8_t a = (int8_t)MIN2(src[3], 127);
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint16_t)((uint32_t)l << 8);
         value |= (uint16_t)(a & 0xff);
#else
         value |= (uint16_t)(l & 0xff);
         value |= (uint16_t)((uint32_t)a << 8);
#endif
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of a16_uint pixels into unsigned-integer RGBA.
 *
 * Each pixel is one 16-bit value loaded in host byte order and written to
 * alpha; r, g and b are set to 0.
 *
 * @param dst_row  output, four unsigned ints (r, g, b, a) per pixel
 * @param src      packed input, 2 bytes per pixel
 * @param width    number of pixels to convert
 */
void
util_format_a16_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint16_t a;
      /* src is only a byte pointer, so copy rather than cast-dereference. */
      memcpy(&a, src, sizeof a);
      dst[0] = 0; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = (unsigned)a; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack unsigned-integer RGBA rows into a16_uint (16 bits per pixel).
 *
 * Only component 3 (alpha) of each input pixel is stored, limited to at
 * most 65535.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four unsigned ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_a16_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)MIN2(src[3], 65535);
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single a16_uint pixel as unsigned-integer RGBA.
 *
 * The 16-bit value at src (host byte order) becomes alpha; r, g and b are 0.
 * The i and j arguments are not used.
 *
 * @param in_dst  output, four unsigned ints (r, g, b, a)
 * @param src     address of the packed pixel (2 bytes)
 */
void
util_format_a16_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint16_t a;
   /* Copy instead of cast-dereferencing: src is only byte-typed. */
   memcpy(&a, src, sizeof a);
   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = (unsigned)a; /* a */
}

/**
 * Pack signed-integer RGBA rows into a16_uint (16 bits per pixel).
 *
 * Only component 3 (alpha) of each input pixel is stored, saturated to
 * [0, 65535].
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_a16_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)CLAMP(src[3], 0, 65535);
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of i16_uint pixels into unsigned-integer RGBA.
 *
 * Each pixel is one 16-bit value loaded in host byte order and replicated
 * to all four output channels.
 *
 * @param dst_row  output, four unsigned ints (r, g, b, a) per pixel
 * @param src      packed input, 2 bytes per pixel
 * @param width    number of pixels to convert
 */
void
util_format_i16_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint16_t rgba;
      /* src is only a byte pointer, so copy rather than cast-dereference. */
      memcpy(&rgba, src, sizeof rgba);
      dst[0] = (unsigned)rgba; /* r */
      dst[1] = (unsigned)rgba; /* g */
      dst[2] = (unsigned)rgba; /* b */
      dst[3] = (unsigned)rgba; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack unsigned-integer RGBA rows into i16_uint (16 bits per pixel).
 *
 * Only component 0 of each input pixel is stored, limited to at most 65535.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four unsigned ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_i16_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)MIN2(src[0], 65535);
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single i16_uint pixel as unsigned-integer RGBA.
 *
 * The 16-bit value at src (host byte order) is replicated to all four
 * channels.  The i and j arguments are not used.
 *
 * @param in_dst  output, four unsigned ints (r, g, b, a)
 * @param src     address of the packed pixel (2 bytes)
 */
void
util_format_i16_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint16_t rgba;
   /* Copy instead of cast-dereferencing: src is only byte-typed. */
   memcpy(&rgba, src, sizeof rgba);
   dst[0] = (unsigned)rgba; /* r */
   dst[1] = (unsigned)rgba; /* g */
   dst[2] = (unsigned)rgba; /* b */
   dst[3] = (unsigned)rgba; /* a */
}

/**
 * Pack signed-integer RGBA rows into i16_uint (16 bits per pixel).
 *
 * Only component 0 of each input pixel is stored, saturated to [0, 65535].
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_i16_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)CLAMP(src[0], 0, 65535);
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of l16_uint pixels into unsigned-integer RGBA.
 *
 * Each pixel is one 16-bit value loaded in host byte order and replicated
 * to r, g and b; alpha is set to 1.
 *
 * @param dst_row  output, four unsigned ints (r, g, b, a) per pixel
 * @param src      packed input, 2 bytes per pixel
 * @param width    number of pixels to convert
 */
void
util_format_l16_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint16_t rgb;
      /* src is only a byte pointer, so copy rather than cast-dereference. */
      memcpy(&rgb, src, sizeof rgb);
      dst[0] = (unsigned)rgb; /* r */
      dst[1] = (unsigned)rgb; /* g */
      dst[2] = (unsigned)rgb; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack unsigned-integer RGBA rows into l16_uint (16 bits per pixel).
 *
 * Only component 0 of each input pixel is stored, limited to at most 65535.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four unsigned ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l16_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)MIN2(src[0], 65535);
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l16_uint pixel as unsigned-integer RGBA.
 *
 * The 16-bit value at src (host byte order) is replicated to r, g and b;
 * alpha is set to 1.  The i and j arguments are not used.
 *
 * @param in_dst  output, four unsigned ints (r, g, b, a)
 * @param src     address of the packed pixel (2 bytes)
 */
void
util_format_l16_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint16_t rgb;
   /* Copy instead of cast-dereferencing: src is only byte-typed. */
   memcpy(&rgb, src, sizeof rgb);
   dst[0] = (unsigned)rgb; /* r */
   dst[1] = (unsigned)rgb; /* g */
   dst[2] = (unsigned)rgb; /* b */
   dst[3] = 1; /* a */
}

/**
 * Pack signed-integer RGBA rows into l16_uint (16 bits per pixel).
 *
 * Only component 0 of each input pixel is stored, saturated to [0, 65535].
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l16_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)CLAMP(src[0], 0, 65535);
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of l16a16_uint pixels into unsigned-integer RGBA.
 *
 * Every pixel is a 32-bit word loaded in host byte order and split into two
 * 16-bit channels; UTIL_ARCH_BIG_ENDIAN selects which half is luminance and
 * which is alpha.  Luminance is replicated to r, g and b.
 *
 * @param dst_row  output, four unsigned ints (r, g, b, a) per pixel
 * @param src      packed input, 4 bytes per pixel
 * @param width    number of pixels to convert
 */
void
util_format_l16a16_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;
      /* src is only a byte pointer, so copy rather than cast-dereference. */
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      const uint32_t rgb = value >> 16;
      const uint32_t a = (value) & 0xffff;
#else
      const uint32_t rgb = (value) & 0xffff;
      const uint32_t a = value >> 16;
#endif
      dst[0] = (unsigned)rgb; /* r */
      dst[1] = (unsigned)rgb; /* g */
      dst[2] = (unsigned)rgb; /* b */
      dst[3] = (unsigned)a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack unsigned-integer RGBA rows into l16a16_uint (32 bits per pixel).
 *
 * Components 0 (luminance) and 3 (alpha) of each input pixel are limited to
 * at most 65535 and combined into one 32-bit word; UTIL_ARCH_BIG_ENDIAN
 * selects which half each channel occupies.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four unsigned ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l16a16_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t l = (uint16_t)MIN2(src[0], 65535);
         const uint16_t a = (uint16_t)MIN2(src[3], 65535);
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)l << 16;
         value |= a & 0xffff;
#else
         value |= l & 0xffff;
         value |= (uint32_t)a << 16;
#endif
         /* dst is a byte pointer; memcpy avoids a misaligned 32-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l16a16_uint pixel as unsigned-integer RGBA.
 *
 * The 32-bit word at src is loaded in host byte order and split into two
 * 16-bit channels (layout chosen by UTIL_ARCH_BIG_ENDIAN).  Luminance is
 * replicated to r, g and b.  The i and j arguments are not used.
 *
 * @param in_dst  output, four unsigned ints (r, g, b, a)
 * @param src     address of the packed pixel (4 bytes)
 */
void
util_format_l16a16_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint32_t value;
   /* Copy instead of cast-dereferencing: src is only byte-typed. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   const uint32_t rgb = value >> 16;
   const uint32_t a = (value) & 0xffff;
#else
   const uint32_t rgb = (value) & 0xffff;
   const uint32_t a = value >> 16;
#endif
   dst[0] = (unsigned)rgb; /* r */
   dst[1] = (unsigned)rgb; /* g */
   dst[2] = (unsigned)rgb; /* b */
   dst[3] = (unsigned)a; /* a */
}

/**
 * Pack signed-integer RGBA rows into l16a16_uint (32 bits per pixel).
 *
 * Components 0 (luminance) and 3 (alpha) of each input pixel are saturated
 * to [0, 65535] and combined into one 32-bit word; UTIL_ARCH_BIG_ENDIAN
 * selects which half each channel occupies.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l16a16_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t l = (uint16_t)CLAMP(src[0], 0, 65535);
         const uint16_t a = (uint16_t)CLAMP(src[3], 0, 65535);
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)l << 16;
         value |= a & 0xffff;
#else
         value |= l & 0xffff;
         value |= (uint32_t)a << 16;
#endif
         /* dst is a byte pointer; memcpy avoids a misaligned 32-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of a16_sint pixels into signed-integer RGBA.
 *
 * Each pixel is one 16-bit value loaded in host byte order, reinterpreted
 * as int16_t and written to alpha; r, g and b are set to 0.
 *
 * @param dst_row  output, four ints (r, g, b, a) per pixel
 * @param src      packed input, 2 bytes per pixel
 * @param width    number of pixels to convert
 */
void
util_format_a16_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint16_t value;
      /* src is only a byte pointer, so copy rather than cast-dereference. */
      memcpy(&value, src, sizeof value);
      const int16_t a = (int16_t)(value);
      dst[0] = 0; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = (int)a; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack signed-integer RGBA rows into a16_sint (16 bits per pixel).
 *
 * Only component 3 (alpha) of each input pixel is stored, saturated to
 * [-32768, 32767].
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_a16_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)((int16_t)CLAMP(src[3], -32768, 32767));
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single a16_sint pixel as signed-integer RGBA.
 *
 * The 16-bit value at src (host byte order) is reinterpreted as int16_t and
 * becomes alpha; r, g and b are 0.  The i and j arguments are not used.
 *
 * @param in_dst  output, four ints (r, g, b, a)
 * @param src     address of the packed pixel (2 bytes)
 */
void
util_format_a16_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   uint16_t value;
   /* Copy instead of cast-dereferencing: src is only byte-typed. */
   memcpy(&value, src, sizeof value);
   const int16_t a = (int16_t)(value);
   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = (int)a; /* a */
}

/**
 * Pack unsigned-integer RGBA rows into a16_sint (16 bits per pixel).
 *
 * Only component 3 (alpha) of each input pixel is stored, limited to at
 * most 32767.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four unsigned ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_a16_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)((int16_t)MIN2(src[3], 32767));
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of i16_sint pixels into signed-integer RGBA.
 *
 * Each pixel is one 16-bit value loaded in host byte order, reinterpreted
 * as int16_t and replicated to all four output channels.
 *
 * @param dst_row  output, four ints (r, g, b, a) per pixel
 * @param src      packed input, 2 bytes per pixel
 * @param width    number of pixels to convert
 */
void
util_format_i16_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint16_t value;
      /* src is only a byte pointer, so copy rather than cast-dereference. */
      memcpy(&value, src, sizeof value);
      const int16_t rgba = (int16_t)(value);
      dst[0] = (int)rgba; /* r */
      dst[1] = (int)rgba; /* g */
      dst[2] = (int)rgba; /* b */
      dst[3] = (int)rgba; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack signed-integer RGBA rows into i16_sint (16 bits per pixel).
 *
 * Only component 0 of each input pixel is stored, saturated to
 * [-32768, 32767].
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_i16_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)((int16_t)CLAMP(src[0], -32768, 32767));
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single i16_sint pixel as signed-integer RGBA.
 *
 * The 16-bit value at src (host byte order) is reinterpreted as int16_t and
 * replicated to all four channels.  The i and j arguments are not used.
 *
 * @param in_dst  output, four ints (r, g, b, a)
 * @param src     address of the packed pixel (2 bytes)
 */
void
util_format_i16_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   uint16_t value;
   /* Copy instead of cast-dereferencing: src is only byte-typed. */
   memcpy(&value, src, sizeof value);
   const int16_t rgba = (int16_t)(value);
   dst[0] = (int)rgba; /* r */
   dst[1] = (int)rgba; /* g */
   dst[2] = (int)rgba; /* b */
   dst[3] = (int)rgba; /* a */
}

/**
 * Pack unsigned-integer RGBA rows into i16_sint (16 bits per pixel).
 *
 * Only component 0 of each input pixel is stored, limited to at most 32767.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four unsigned ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_i16_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)((int16_t)MIN2(src[0], 32767));
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of l16_sint pixels into signed-integer RGBA.
 *
 * Each pixel is one 16-bit value loaded in host byte order, reinterpreted
 * as int16_t and replicated to r, g and b; alpha is set to 1.
 *
 * @param dst_row  output, four ints (r, g, b, a) per pixel
 * @param src      packed input, 2 bytes per pixel
 * @param width    number of pixels to convert
 */
void
util_format_l16_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint16_t value;
      /* src is only a byte pointer, so copy rather than cast-dereference. */
      memcpy(&value, src, sizeof value);
      const int16_t rgb = (int16_t)(value);
      dst[0] = (int)rgb; /* r */
      dst[1] = (int)rgb; /* g */
      dst[2] = (int)rgb; /* b */
      dst[3] = 1; /* a */
      src += 2;
      dst += 4;
   }
}

/**
 * Pack signed-integer RGBA rows into l16_sint (16 bits per pixel).
 *
 * Only component 0 of each input pixel is stored, saturated to
 * [-32768, 32767].
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l16_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)((int16_t)CLAMP(src[0], -32768, 32767));
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l16_sint pixel as signed-integer RGBA.
 *
 * The 16-bit value at src (host byte order) is reinterpreted as int16_t and
 * replicated to r, g and b; alpha is set to 1.  The i and j arguments are
 * not used.
 *
 * @param in_dst  output, four ints (r, g, b, a)
 * @param src     address of the packed pixel (2 bytes)
 */
void
util_format_l16_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   uint16_t value;
   /* Copy instead of cast-dereferencing: src is only byte-typed. */
   memcpy(&value, src, sizeof value);
   const int16_t rgb = (int16_t)(value);
   dst[0] = (int)rgb; /* r */
   dst[1] = (int)rgb; /* g */
   dst[2] = (int)rgb; /* b */
   dst[3] = 1; /* a */
}

/**
 * Pack unsigned-integer RGBA rows into l16_sint (16 bits per pixel).
 *
 * Only component 0 of each input pixel is stored, limited to at most 32767.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four unsigned ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l16_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)((int16_t)MIN2(src[0], 32767));
         /* dst is a byte pointer; memcpy avoids a misaligned 16-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of l16a16_sint pixels into signed-integer RGBA.
 *
 * Every pixel is a 32-bit word loaded in host byte order and split into two
 * sign-extended 16-bit channels; UTIL_ARCH_BIG_ENDIAN selects which half is
 * luminance and which is alpha.  Luminance is replicated to r, g and b.
 *
 * @param dst_row  output, four ints (r, g, b, a) per pixel
 * @param src      packed input, 4 bytes per pixel
 * @param width    number of pixels to convert
 */
void
util_format_l16a16_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t value;
      /* src is only a byte pointer, so copy rather than cast-dereference. */
      memcpy(&value, src, sizeof value);
      /* Shifting up and then right by 16 sign-extends the low half; this
       * relies on arithmetic right shift of negative values. */
#if UTIL_ARCH_BIG_ENDIAN
      const int32_t rgb = ((int32_t)(value)) >> 16;
      const int32_t a = ((int32_t)(value << 16)) >> 16;
#else
      const int32_t rgb = ((int32_t)(value << 16)) >> 16;
      const int32_t a = ((int32_t)(value)) >> 16;
#endif
      dst[0] = (int)rgb; /* r */
      dst[1] = (int)rgb; /* g */
      dst[2] = (int)rgb; /* b */
      dst[3] = (int)a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack signed-integer RGBA rows into l16a16_sint (32 bits per pixel).
 *
 * Components 0 (luminance) and 3 (alpha) of each input pixel are saturated
 * to [-32768, 32767] and combined into one 32-bit word;
 * UTIL_ARCH_BIG_ENDIAN selects which half each channel occupies.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l16a16_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const int16_t l = (int16_t)CLAMP(src[0], -32768, 32767);
         const int16_t a = (int16_t)CLAMP(src[3], -32768, 32767);
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint32_t)l << 16);
         value |= (uint32_t)(a & 0xffff);
#else
         value |= (uint32_t)(l & 0xffff);
         value |= (uint32_t)((uint32_t)a << 16);
#endif
         /* dst is a byte pointer; memcpy avoids a misaligned 32-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l16a16_sint pixel as signed-integer RGBA.
 *
 * The 32-bit word at src is loaded in host byte order and split into two
 * sign-extended 16-bit channels (layout chosen by UTIL_ARCH_BIG_ENDIAN).
 * Luminance is replicated to r, g and b.  The i and j arguments are not used.
 *
 * @param in_dst  output, four ints (r, g, b, a)
 * @param src     address of the packed pixel (4 bytes)
 */
void
util_format_l16a16_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   uint32_t value;
   /* Copy instead of cast-dereferencing: src is only byte-typed. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   const int32_t rgb = ((int32_t)(value)) >> 16;
   const int32_t a = ((int32_t)(value << 16)) >> 16;
#else
   const int32_t rgb = ((int32_t)(value << 16)) >> 16;
   const int32_t a = ((int32_t)(value)) >> 16;
#endif
   dst[0] = (int)rgb; /* r */
   dst[1] = (int)rgb; /* g */
   dst[2] = (int)rgb; /* b */
   dst[3] = (int)a; /* a */
}

/**
 * Pack unsigned-integer RGBA rows into l16a16_sint (32 bits per pixel).
 *
 * Components 0 (luminance) and 3 (alpha) of each input pixel are limited to
 * at most 32767 and combined into one 32-bit word; UTIL_ARCH_BIG_ENDIAN
 * selects which half each channel occupies.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four unsigned ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l16a16_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const int16_t l = (int16_t)MIN2(src[0], 32767);
         const int16_t a = (int16_t)MIN2(src[3], 32767);
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint32_t)l << 16);
         value |= (uint32_t)(a & 0xffff);
#else
         value |= (uint32_t)(l & 0xffff);
         value |= (uint32_t)((uint32_t)a << 16);
#endif
         /* dst is a byte pointer; memcpy avoids a misaligned 32-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of a32_uint pixels into unsigned-integer RGBA.
 *
 * Each pixel is one 32-bit value loaded in host byte order and written to
 * alpha; r, g and b are set to 0.
 *
 * @param dst_row  output, four unsigned ints (r, g, b, a) per pixel
 * @param src      packed input, 4 bytes per pixel
 * @param width    number of pixels to convert
 */
void
util_format_a32_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t a;
      /* src is only a byte pointer, so copy rather than cast-dereference. */
      memcpy(&a, src, sizeof a);
      dst[0] = 0; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack unsigned-integer RGBA rows into a32_uint (32 bits per pixel).
 *
 * Only component 3 (alpha) of each input pixel is stored, unmodified.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four unsigned ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_a32_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint32_t value = src[3];
         /* dst is a byte pointer; memcpy avoids a misaligned 32-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single a32_uint pixel as unsigned-integer RGBA.
 *
 * The 32-bit value at src (host byte order) becomes alpha; r, g and b are 0.
 * The i and j arguments are not used.
 *
 * @param in_dst  output, four unsigned ints (r, g, b, a)
 * @param src     address of the packed pixel (4 bytes)
 */
void
util_format_a32_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint32_t a;
   /* Copy instead of cast-dereferencing: src is only byte-typed. */
   memcpy(&a, src, sizeof a);
   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = a; /* a */
}

/**
 * Pack signed-integer RGBA rows into a32_uint (32 bits per pixel).
 *
 * Only component 3 (alpha) of each input pixel is stored; negative values
 * are raised to 0.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_a32_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)MAX2(src[3], 0);
         /* dst is a byte pointer; memcpy avoids a misaligned 32-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of i32_uint pixels into unsigned-integer RGBA.
 *
 * Each pixel is one 32-bit value loaded in host byte order and replicated
 * to all four output channels.
 *
 * @param dst_row  output, four unsigned ints (r, g, b, a) per pixel
 * @param src      packed input, 4 bytes per pixel
 * @param width    number of pixels to convert
 */
void
util_format_i32_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t rgba;
      /* src is only a byte pointer, so copy rather than cast-dereference. */
      memcpy(&rgba, src, sizeof rgba);
      dst[0] = rgba; /* r */
      dst[1] = rgba; /* g */
      dst[2] = rgba; /* b */
      dst[3] = rgba; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack unsigned-integer RGBA rows into i32_uint (32 bits per pixel).
 *
 * Only component 0 of each input pixel is stored, unmodified.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four unsigned ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_i32_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint32_t value = src[0];
         /* dst is a byte pointer; memcpy avoids a misaligned 32-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single i32_uint pixel as unsigned-integer RGBA.
 *
 * The 32-bit value at src (host byte order) is replicated to all four
 * channels.  The i and j arguments are not used.
 *
 * @param in_dst  output, four unsigned ints (r, g, b, a)
 * @param src     address of the packed pixel (4 bytes)
 */
void
util_format_i32_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint32_t rgba;
   /* Copy instead of cast-dereferencing: src is only byte-typed. */
   memcpy(&rgba, src, sizeof rgba);
   dst[0] = rgba; /* r */
   dst[1] = rgba; /* g */
   dst[2] = rgba; /* b */
   dst[3] = rgba; /* a */
}

/**
 * Pack signed-integer RGBA rows into i32_uint (32 bits per pixel).
 *
 * Only component 0 of each input pixel is stored; negative values are
 * raised to 0.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_i32_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)MAX2(src[0], 0);
         /* dst is a byte pointer; memcpy avoids a misaligned 32-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a row of l32_uint pixels into unsigned-integer RGBA.
 *
 * Each pixel is one 32-bit value loaded in host byte order and replicated
 * to r, g and b; alpha is set to 1.
 *
 * @param dst_row  output, four unsigned ints (r, g, b, a) per pixel
 * @param src      packed input, 4 bytes per pixel
 * @param width    number of pixels to convert
 */
void
util_format_l32_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      uint32_t rgb;
      /* src is only a byte pointer, so copy rather than cast-dereference. */
      memcpy(&rgb, src, sizeof rgb);
      dst[0] = rgb; /* r */
      dst[1] = rgb; /* g */
      dst[2] = rgb; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack unsigned-integer RGBA rows into l32_uint (32 bits per pixel).
 *
 * Only component 0 of each input pixel is stored, unmodified.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four unsigned ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l32_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint32_t value = src[0];
         /* dst is a byte pointer; memcpy avoids a misaligned 32-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l32_uint pixel as unsigned-integer RGBA.
 *
 * The 32-bit value at src (host byte order) is replicated to r, g and b;
 * alpha is set to 1.  The i and j arguments are not used.
 *
 * @param in_dst  output, four unsigned ints (r, g, b, a)
 * @param src     address of the packed pixel (4 bytes)
 */
void
util_format_l32_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint32_t rgb;
   /* Copy instead of cast-dereferencing: src is only byte-typed. */
   memcpy(&rgb, src, sizeof rgb);
   dst[0] = rgb; /* r */
   dst[1] = rgb; /* g */
   dst[2] = rgb; /* b */
   dst[3] = 1; /* a */
}

/**
 * Pack signed-integer RGBA rows into l32_uint (32 bits per pixel).
 *
 * Only component 0 of each input pixel is stored; negative values are
 * raised to 0.
 *
 * @param dst_row     first byte of the packed output
 * @param dst_stride  bytes between output rows
 * @param src_row     RGBA input, four ints per pixel
 * @param src_stride  bytes between input rows
 * @param width       pixels per row
 * @param height      number of rows
 */
void
util_format_l32_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (unsigned x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)MAX2(src[0], 0);
         /* dst is a byte pointer; memcpy avoids a misaligned 32-bit store. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory image of one l32a32_uint pixel: a 32-bit luminance word followed
 * by a 32-bit alpha word.  The member order does not depend on
 * UTIL_ARCH_BIG_ENDIAN.
 */
struct util_format_l32a32_uint {
   uint32_t rgb; /* luminance, replicated to r/g/b when unpacked */
   uint32_t a;   /* alpha */
};

void
util_format_l32a32_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = pixel.a; /* a */
#else
         struct util_format_l32a32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = pixel.a; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_l32a32_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_uint pixel = {0};
         pixel.rgb = src[0];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l32a32_uint pixel = {0};
         pixel.rgb = src[0];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l32a32_uint pixel as unsigned-integer RGBA.
 *
 * The 8 bytes at src are copied into a struct util_format_l32a32_uint; its
 * rgb member is replicated to r, g and b and its a member becomes alpha.
 * The i and j arguments are not used.
 *
 * @param in_dst  output, four unsigned ints (r, g, b, a)
 * @param src     address of the packed pixel (8 bytes)
 */
void
util_format_l32a32_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   struct util_format_l32a32_uint texel;

   memcpy(&texel, src, sizeof texel);
   rgba[0] = texel.rgb; /* r */
   rgba[1] = texel.rgb; /* g */
   rgba[2] = texel.rgb; /* b */
   rgba[3] = texel.a; /* a */
}

void
util_format_l32a32_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_uint pixel = {0};
         pixel.rgb = (uint32_t)MAX2(src[0], 0);
         pixel.a = (uint32_t)MAX2(src[3], 0);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l32a32_uint pixel = {0};
         pixel.rgb = (uint32_t)MAX2(src[0], 0);
         pixel.a = (uint32_t)MAX2(src[3], 0);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of A32_SINT pixels into signed RGBA.
 *
 * Reads width consecutive 4-byte pixels from src and writes four ints per
 * pixel to dst_row: r, g and b are 0, alpha is the stored 32-bit value.
 */
void
util_format_a32_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is only a byte pointer and need not be 4-byte aligned, so copy
       * the word out instead of dereferencing a cast uint32_t pointer. */
      uint32_t value;
      memcpy(&value, src, sizeof value);
      int32_t a = (int32_t)value;
      dst[0] = 0; /* r */
      dst[1] = 0; /* g */
      dst[2] = 0; /* b */
      dst[3] = a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack signed RGBA rows into A32_SINT pixels.
 *
 * For each of the width x height pixels, source component 3 (alpha) is
 * stored as one 32-bit word.  dst_stride and src_stride are in bytes; each
 * source pixel occupies four ints.
 */
void
util_format_a32_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = (uint32_t)(src[3]);
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A32_SINT texel as four signed RGBA values.
 *
 * r, g and b are set to 0 and alpha to the 32-bit value read from src.
 * The i and j arguments are not used.
 */
void
util_format_a32_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   uint32_t value;

   /* src is only a byte pointer and need not be 4-byte aligned, so copy
    * the word out instead of dereferencing a cast uint32_t pointer. */
   memcpy(&value, src, sizeof value);
   int32_t a = (int32_t)value;
   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = a; /* a */
}

/**
 * Pack unsigned RGBA rows into A32_SINT pixels.
 *
 * For each of the width x height pixels, source component 3 (alpha) is
 * limited with MIN2 to at most 2147483647 and stored as one 32-bit word.
 * dst_stride and src_stride are in bytes; each source pixel occupies four
 * unsigned ints.
 */
void
util_format_a32_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = (uint32_t)((int32_t)MIN2(src[3], 2147483647));
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of I32_SINT pixels into signed RGBA.
 *
 * Reads width consecutive 4-byte pixels from src and writes four ints per
 * pixel to dst_row; the stored 32-bit value is replicated into r, g, b
 * and a.
 */
void
util_format_i32_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is only a byte pointer and need not be 4-byte aligned, so copy
       * the word out instead of dereferencing a cast uint32_t pointer. */
      uint32_t value;
      memcpy(&value, src, sizeof value);
      int32_t rgba = (int32_t)value;
      dst[0] = rgba; /* r */
      dst[1] = rgba; /* g */
      dst[2] = rgba; /* b */
      dst[3] = rgba; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack signed RGBA rows into I32_SINT pixels.
 *
 * For each of the width x height pixels, source component 0 is stored as
 * one 32-bit word.  dst_stride and src_stride are in bytes; each source
 * pixel occupies four ints.
 */
void
util_format_i32_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = (uint32_t)(src[0]);
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single I32_SINT texel as four signed RGBA values.
 *
 * The 32-bit value read from src is replicated into r, g, b and a.
 * The i and j arguments are not used.
 */
void
util_format_i32_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   uint32_t value;

   /* src is only a byte pointer and need not be 4-byte aligned, so copy
    * the word out instead of dereferencing a cast uint32_t pointer. */
   memcpy(&value, src, sizeof value);
   int32_t rgba = (int32_t)value;
   dst[0] = rgba; /* r */
   dst[1] = rgba; /* g */
   dst[2] = rgba; /* b */
   dst[3] = rgba; /* a */
}

/**
 * Pack unsigned RGBA rows into I32_SINT pixels.
 *
 * For each of the width x height pixels, source component 0 is limited
 * with MIN2 to at most 2147483647 and stored as one 32-bit word.
 * dst_stride and src_stride are in bytes; each source pixel occupies four
 * unsigned ints.
 */
void
util_format_i32_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = (uint32_t)((int32_t)MIN2(src[0], 2147483647));
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of L32_SINT pixels into signed RGBA.
 *
 * Reads width consecutive 4-byte pixels from src and writes four ints per
 * pixel to dst_row; the stored 32-bit value is replicated into r, g and b,
 * and alpha is set to 1.
 */
void
util_format_l32_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is only a byte pointer and need not be 4-byte aligned, so copy
       * the word out instead of dereferencing a cast uint32_t pointer. */
      uint32_t value;
      memcpy(&value, src, sizeof value);
      int32_t rgb = (int32_t)value;
      dst[0] = rgb; /* r */
      dst[1] = rgb; /* g */
      dst[2] = rgb; /* b */
      dst[3] = 1; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack signed RGBA rows into L32_SINT pixels.
 *
 * For each of the width x height pixels, source component 0 is stored as
 * one 32-bit word.  dst_stride and src_stride are in bytes; each source
 * pixel occupies four ints.
 */
void
util_format_l32_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = (uint32_t)(src[0]);
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single L32_SINT texel as four signed RGBA values.
 *
 * The 32-bit value read from src is replicated into r, g and b; alpha is
 * set to 1.  The i and j arguments are not used.
 */
void
util_format_l32_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   uint32_t value;

   /* src is only a byte pointer and need not be 4-byte aligned, so copy
    * the word out instead of dereferencing a cast uint32_t pointer. */
   memcpy(&value, src, sizeof value);
   int32_t rgb = (int32_t)value;
   dst[0] = rgb; /* r */
   dst[1] = rgb; /* g */
   dst[2] = rgb; /* b */
   dst[3] = 1; /* a */
}

/**
 * Pack unsigned RGBA rows into L32_SINT pixels.
 *
 * For each of the width x height pixels, source component 0 is limited
 * with MIN2 to at most 2147483647 and stored as one 32-bit word.
 * dst_stride and src_stride are in bytes; each source pixel occupies four
 * unsigned ints.
 */
void
util_format_l32_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = (uint32_t)((int32_t)MIN2(src[0], 2147483647));
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one L32A32_SINT texel: two signed 32-bit words.
 * The member order is the same for both byte orders.
 */
struct util_format_l32a32_sint {
   int32_t rgb; /* value replicated into r, g and b when unpacking */
   int32_t a;   /* alpha */
};

void
util_format_l32a32_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = pixel.a; /* a */
#else
         struct util_format_l32a32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = pixel.a; /* a */
#endif
      src += 8;
      dst += 4;
   }
}

void
util_format_l32a32_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_sint pixel = {0};
         pixel.rgb = src[0];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l32a32_sint pixel = {0};
         pixel.rgb = src[0];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single L32A32_SINT texel as four signed RGBA values.
 *
 * The texel is copied out of src with memcpy; its rgb member is written to
 * r, g and b and its a member to alpha.  The i and j arguments are not
 * used.
 */
void
util_format_l32a32_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *rgba = in_dst;
   struct util_format_l32a32_sint texel;

   /* The same statements serve both byte orders. */
   memcpy(&texel, src, sizeof texel);
   rgba[0] = texel.rgb; /* r */
   rgba[1] = texel.rgb; /* g */
   rgba[2] = texel.rgb; /* b */
   rgba[3] = texel.a; /* a */
}

void
util_format_l32a32_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_sint pixel = {0};
         pixel.rgb = (int32_t)MIN2(src[0], 2147483647);
         pixel.a = (int32_t)MIN2(src[3], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l32a32_sint pixel = {0};
         pixel.rgb = (int32_t)MIN2(src[0], 2147483647);
         pixel.a = (int32_t)MIN2(src[3], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one B8G8R8_UINT texel: three unsigned bytes stored
 * in b, g, r order.  The member order is the same for both byte orders.
 */
struct util_format_b8g8r8_uint {
   uint8_t b; /* blue */
   uint8_t g; /* green */
   uint8_t r; /* red */
};

void
util_format_b8g8r8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_b8g8r8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uint pixel = {0};
         pixel.b = (uint8_t)MIN2(src[2], 255);
         pixel.g = (uint8_t)MIN2(src[1], 255);
         pixel.r = (uint8_t)MIN2(src[0], 255);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_uint pixel = {0};
         pixel.b = (uint8_t)MIN2(src[2], 255);
         pixel.g = (uint8_t)MIN2(src[1], 255);
         pixel.r = (uint8_t)MIN2(src[0], 255);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8_UINT texel as four unsigned RGBA values.
 *
 * The texel is copied out of src with memcpy and written in r, g, b, a
 * order with alpha set to 1.  The i and j arguments are not used.
 */
void
util_format_b8g8r8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   struct util_format_b8g8r8_uint texel;

   /* The same statements serve both byte orders. */
   memcpy(&texel, src, sizeof texel);
   rgba[0] = (unsigned)texel.r; /* r */
   rgba[1] = (unsigned)texel.g; /* g */
   rgba[2] = (unsigned)texel.b; /* b */
   rgba[3] = 1; /* a */
}

void
util_format_b8g8r8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uint pixel = {0};
         pixel.b = (uint8_t)CLAMP(src[2], 0, 255);
         pixel.g = (uint8_t)CLAMP(src[1], 0, 255);
         pixel.r = (uint8_t)CLAMP(src[0], 0, 255);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_uint pixel = {0};
         pixel.b = (uint8_t)CLAMP(src[2], 0, 255);
         pixel.g = (uint8_t)CLAMP(src[1], 0, 255);
         pixel.r = (uint8_t)CLAMP(src[0], 0, 255);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of B8G8R8A8_UINT pixels into unsigned RGBA.
 *
 * Reads width consecutive 4-byte pixels from src and writes four unsigned
 * ints per pixel to dst_row in r, g, b, a order.  The position of each
 * channel inside the 32-bit word depends on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_b8g8r8a8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is only a byte pointer and need not be 4-byte aligned, so copy
       * the word out instead of dereferencing a cast uint32_t pointer. */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      uint32_t b = value >> 24;
      uint32_t g = (value >> 16) & 0xff;
      uint32_t r = (value >> 8) & 0xff;
      uint32_t a = (value) & 0xff;
#else
      uint32_t b = (value) & 0xff;
      uint32_t g = (value >> 8) & 0xff;
      uint32_t r = (value >> 16) & 0xff;
      uint32_t a = value >> 24;
#endif
      dst[0] = (unsigned)r; /* r */
      dst[1] = (unsigned)g; /* g */
      dst[2] = (unsigned)b; /* b */
      dst[3] = (unsigned)a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack unsigned RGBA rows into B8G8R8A8_UINT pixels.
 *
 * For each of the width x height pixels, every source component is limited
 * with MIN2 to at most 255 and the four bytes are combined into one 32-bit
 * word; the position of each channel depends on UTIL_ARCH_BIG_ENDIAN.
 * dst_stride and src_stride are in bytes; each source pixel occupies four
 * unsigned ints.
 */
void
util_format_b8g8r8a8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint8_t)MIN2(src[2], 255)) << 24;
         value |= (uint32_t)(((uint8_t)MIN2(src[1], 255)) & 0xff) << 16;
         value |= (uint32_t)(((uint8_t)MIN2(src[0], 255)) & 0xff) << 8;
         value |= ((uint8_t)MIN2(src[3], 255)) & 0xff;
#else
         value |= ((uint8_t)MIN2(src[2], 255)) & 0xff;
         value |= (uint32_t)(((uint8_t)MIN2(src[1], 255)) & 0xff) << 8;
         value |= (uint32_t)(((uint8_t)MIN2(src[0], 255)) & 0xff) << 16;
         value |= (uint32_t)((uint8_t)MIN2(src[3], 255)) << 24;
#endif
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8A8_UINT texel as four unsigned RGBA values.
 *
 * The 32-bit word read from src is split into four 8-bit channels and
 * written in r, g, b, a order; the position of each channel depends on
 * UTIL_ARCH_BIG_ENDIAN.  The i and j arguments are not used.
 */
void
util_format_b8g8r8a8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint32_t value;

   /* src is only a byte pointer and need not be 4-byte aligned, so copy
    * the word out instead of dereferencing a cast uint32_t pointer. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   uint32_t b = value >> 24;
   uint32_t g = (value >> 16) & 0xff;
   uint32_t r = (value >> 8) & 0xff;
   uint32_t a = (value) & 0xff;
#else
   uint32_t b = (value) & 0xff;
   uint32_t g = (value >> 8) & 0xff;
   uint32_t r = (value >> 16) & 0xff;
   uint32_t a = value >> 24;
#endif
   dst[0] = (unsigned)r; /* r */
   dst[1] = (unsigned)g; /* g */
   dst[2] = (unsigned)b; /* b */
   dst[3] = (unsigned)a; /* a */
}

/**
 * Pack signed RGBA rows into B8G8R8A8_UINT pixels.
 *
 * For each of the width x height pixels, every source component is passed
 * through CLAMP(.., 0, 255) and the four bytes are combined into one
 * 32-bit word; the position of each channel depends on
 * UTIL_ARCH_BIG_ENDIAN.  dst_stride and src_stride are in bytes; each
 * source pixel occupies four ints.
 */
void
util_format_b8g8r8a8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint8_t)CLAMP(src[2], 0, 255)) << 24;
         value |= (uint32_t)(((uint8_t)CLAMP(src[1], 0, 255)) & 0xff) << 16;
         value |= (uint32_t)(((uint8_t)CLAMP(src[0], 0, 255)) & 0xff) << 8;
         value |= ((uint8_t)CLAMP(src[3], 0, 255)) & 0xff;
#else
         value |= ((uint8_t)CLAMP(src[2], 0, 255)) & 0xff;
         value |= (uint32_t)(((uint8_t)CLAMP(src[1], 0, 255)) & 0xff) << 8;
         value |= (uint32_t)(((uint8_t)CLAMP(src[0], 0, 255)) & 0xff) << 16;
         value |= (uint32_t)((uint8_t)CLAMP(src[3], 0, 255)) << 24;
#endif
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one B8G8R8_SINT texel: three signed bytes stored in
 * b, g, r order.  The member order is the same for both byte orders.
 */
struct util_format_b8g8r8_sint {
   int8_t b; /* blue */
   int8_t g; /* green */
   int8_t r; /* red */
};

void
util_format_b8g8r8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
      src += 3;
      dst += 4;
   }
}

void
util_format_b8g8r8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sint pixel = {0};
         pixel.b = (int8_t)CLAMP(src[2], -128, 127);
         pixel.g = (int8_t)CLAMP(src[1], -128, 127);
         pixel.r = (int8_t)CLAMP(src[0], -128, 127);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_sint pixel = {0};
         pixel.b = (int8_t)CLAMP(src[2], -128, 127);
         pixel.g = (int8_t)CLAMP(src[1], -128, 127);
         pixel.r = (int8_t)CLAMP(src[0], -128, 127);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8_SINT texel as four signed RGBA values.
 *
 * The texel is copied out of src with memcpy and written in r, g, b, a
 * order with alpha set to 1.  The i and j arguments are not used.
 */
void
util_format_b8g8r8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *rgba = in_dst;
   struct util_format_b8g8r8_sint texel;

   /* The same statements serve both byte orders. */
   memcpy(&texel, src, sizeof texel);
   rgba[0] = (int)texel.r; /* r */
   rgba[1] = (int)texel.g; /* g */
   rgba[2] = (int)texel.b; /* b */
   rgba[3] = 1; /* a */
}

void
util_format_b8g8r8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sint pixel = {0};
         pixel.b = (int8_t)MIN2(src[2], 127);
         pixel.g = (int8_t)MIN2(src[1], 127);
         pixel.r = (int8_t)MIN2(src[0], 127);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_sint pixel = {0};
         pixel.b = (int8_t)MIN2(src[2], 127);
         pixel.g = (int8_t)MIN2(src[1], 127);
         pixel.r = (int8_t)MIN2(src[0], 127);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of B8G8R8A8_SINT pixels into signed RGBA.
 *
 * Reads width consecutive 4-byte pixels from src and writes four ints per
 * pixel to dst_row in r, g, b, a order.  Each 8-bit channel is moved to
 * the top byte and shifted back down as a signed value to sign-extend it;
 * the position of each channel depends on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_b8g8r8a8_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is only a byte pointer and need not be 4-byte aligned, so copy
       * the word out instead of dereferencing a cast uint32_t pointer. */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      int32_t b = ((int32_t)(value) ) >> 24;
      int32_t g = ((int32_t)(value << 8) ) >> 24;
      int32_t r = ((int32_t)(value << 16) ) >> 24;
      int32_t a = ((int32_t)(value << 24) ) >> 24;
#else
      int32_t b = ((int32_t)(value << 24) ) >> 24;
      int32_t g = ((int32_t)(value << 16) ) >> 24;
      int32_t r = ((int32_t)(value << 8) ) >> 24;
      int32_t a = ((int32_t)(value) ) >> 24;
#endif
      dst[0] = (int)r; /* r */
      dst[1] = (int)g; /* g */
      dst[2] = (int)b; /* b */
      dst[3] = (int)a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack signed RGBA rows into B8G8R8A8_SINT pixels.
 *
 * For each of the width x height pixels, every source component is passed
 * through CLAMP(.., -128, 127) and the four bytes are combined into one
 * 32-bit word; the position of each channel depends on
 * UTIL_ARCH_BIG_ENDIAN.  dst_stride and src_stride are in bytes; each
 * source pixel occupies four ints.
 */
void
util_format_b8g8r8a8_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint32_t)((int8_t)CLAMP(src[2], -128, 127)) << 24) ;
         value |= (uint32_t)((uint32_t)(((int8_t)CLAMP(src[1], -128, 127)) & 0xff) << 16) ;
         value |= (uint32_t)((uint32_t)(((int8_t)CLAMP(src[0], -128, 127)) & 0xff) << 8) ;
         value |= (uint32_t)(((int8_t)CLAMP(src[3], -128, 127)) & 0xff) ;
#else
         value |= (uint32_t)(((int8_t)CLAMP(src[2], -128, 127)) & 0xff) ;
         value |= (uint32_t)((uint32_t)(((int8_t)CLAMP(src[1], -128, 127)) & 0xff) << 8) ;
         value |= (uint32_t)((uint32_t)(((int8_t)CLAMP(src[0], -128, 127)) & 0xff) << 16) ;
         value |= (uint32_t)((uint32_t)((int8_t)CLAMP(src[3], -128, 127)) << 24) ;
#endif
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8A8_SINT texel as four signed RGBA values.
 *
 * The 32-bit word read from src is split into four sign-extended 8-bit
 * channels and written in r, g, b, a order; the position of each channel
 * depends on UTIL_ARCH_BIG_ENDIAN.  The i and j arguments are not used.
 */
void
util_format_b8g8r8a8_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *dst = in_dst;
   uint32_t value;

   /* src is only a byte pointer and need not be 4-byte aligned, so copy
    * the word out instead of dereferencing a cast uint32_t pointer. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   int32_t b = ((int32_t)(value) ) >> 24;
   int32_t g = ((int32_t)(value << 8) ) >> 24;
   int32_t r = ((int32_t)(value << 16) ) >> 24;
   int32_t a = ((int32_t)(value << 24) ) >> 24;
#else
   int32_t b = ((int32_t)(value << 24) ) >> 24;
   int32_t g = ((int32_t)(value << 16) ) >> 24;
   int32_t r = ((int32_t)(value << 8) ) >> 24;
   int32_t a = ((int32_t)(value) ) >> 24;
#endif
   dst[0] = (int)r; /* r */
   dst[1] = (int)g; /* g */
   dst[2] = (int)b; /* b */
   dst[3] = (int)a; /* a */
}

/**
 * Pack unsigned RGBA rows into B8G8R8A8_SINT pixels.
 *
 * For each of the width x height pixels, every source component is limited
 * with MIN2 to at most 127 and the four bytes are combined into one 32-bit
 * word; the position of each channel depends on UTIL_ARCH_BIG_ENDIAN.
 * dst_stride and src_stride are in bytes; each source pixel occupies four
 * unsigned ints.
 */
void
util_format_b8g8r8a8_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint32_t)((int8_t)MIN2(src[2], 127)) << 24) ;
         value |= (uint32_t)((uint32_t)(((int8_t)MIN2(src[1], 127)) & 0xff) << 16) ;
         value |= (uint32_t)((uint32_t)(((int8_t)MIN2(src[0], 127)) & 0xff) << 8) ;
         value |= (uint32_t)(((int8_t)MIN2(src[3], 127)) & 0xff) ;
#else
         value |= (uint32_t)(((int8_t)MIN2(src[2], 127)) & 0xff) ;
         value |= (uint32_t)((uint32_t)(((int8_t)MIN2(src[1], 127)) & 0xff) << 8) ;
         value |= (uint32_t)((uint32_t)(((int8_t)MIN2(src[0], 127)) & 0xff) << 16) ;
         value |= (uint32_t)((uint32_t)((int8_t)MIN2(src[3], 127)) << 24) ;
#endif
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of A8R8G8B8_UINT pixels into unsigned RGBA.
 *
 * Reads width consecutive 4-byte pixels from src and writes four unsigned
 * ints per pixel to dst_row in r, g, b, a order.  The position of each
 * channel inside the 32-bit word depends on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a8r8g8b8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is only a byte pointer and need not be 4-byte aligned, so copy
       * the word out instead of dereferencing a cast uint32_t pointer. */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      uint32_t a = value >> 24;
      uint32_t r = (value >> 16) & 0xff;
      uint32_t g = (value >> 8) & 0xff;
      uint32_t b = (value) & 0xff;
#else
      uint32_t a = (value) & 0xff;
      uint32_t r = (value >> 8) & 0xff;
      uint32_t g = (value >> 16) & 0xff;
      uint32_t b = value >> 24;
#endif
      dst[0] = (unsigned)r; /* r */
      dst[1] = (unsigned)g; /* g */
      dst[2] = (unsigned)b; /* b */
      dst[3] = (unsigned)a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack unsigned RGBA rows into A8R8G8B8_UINT pixels.
 *
 * For each of the width x height pixels, every source component is limited
 * with MIN2 to at most 255 and the four bytes are combined into one 32-bit
 * word; the position of each channel depends on UTIL_ARCH_BIG_ENDIAN.
 * dst_stride and src_stride are in bytes; each source pixel occupies four
 * unsigned ints.
 */
void
util_format_a8r8g8b8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint8_t)MIN2(src[3], 255)) << 24;
         value |= (uint32_t)(((uint8_t)MIN2(src[0], 255)) & 0xff) << 16;
         value |= (uint32_t)(((uint8_t)MIN2(src[1], 255)) & 0xff) << 8;
         value |= ((uint8_t)MIN2(src[2], 255)) & 0xff;
#else
         value |= ((uint8_t)MIN2(src[3], 255)) & 0xff;
         value |= (uint32_t)(((uint8_t)MIN2(src[0], 255)) & 0xff) << 8;
         value |= (uint32_t)(((uint8_t)MIN2(src[1], 255)) & 0xff) << 16;
         value |= (uint32_t)((uint8_t)MIN2(src[2], 255)) << 24;
#endif
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A8R8G8B8_UINT texel as four unsigned RGBA values.
 *
 * The 32-bit word read from src is split into four 8-bit channels and
 * written in r, g, b, a order; the position of each channel depends on
 * UTIL_ARCH_BIG_ENDIAN.  The i and j arguments are not used.
 */
void
util_format_a8r8g8b8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint32_t value;

   /* src is only a byte pointer and need not be 4-byte aligned, so copy
    * the word out instead of dereferencing a cast uint32_t pointer. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   uint32_t a = value >> 24;
   uint32_t r = (value >> 16) & 0xff;
   uint32_t g = (value >> 8) & 0xff;
   uint32_t b = (value) & 0xff;
#else
   uint32_t a = (value) & 0xff;
   uint32_t r = (value >> 8) & 0xff;
   uint32_t g = (value >> 16) & 0xff;
   uint32_t b = value >> 24;
#endif
   dst[0] = (unsigned)r; /* r */
   dst[1] = (unsigned)g; /* g */
   dst[2] = (unsigned)b; /* b */
   dst[3] = (unsigned)a; /* a */
}

/**
 * Pack signed RGBA rows into A8R8G8B8_UINT pixels.
 *
 * For each of the width x height pixels, every source component is passed
 * through CLAMP(.., 0, 255) and the four bytes are combined into one
 * 32-bit word; the position of each channel depends on
 * UTIL_ARCH_BIG_ENDIAN.  dst_stride and src_stride are in bytes; each
 * source pixel occupies four ints.
 */
void
util_format_a8r8g8b8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint8_t)CLAMP(src[3], 0, 255)) << 24;
         value |= (uint32_t)(((uint8_t)CLAMP(src[0], 0, 255)) & 0xff) << 16;
         value |= (uint32_t)(((uint8_t)CLAMP(src[1], 0, 255)) & 0xff) << 8;
         value |= ((uint8_t)CLAMP(src[2], 0, 255)) & 0xff;
#else
         value |= ((uint8_t)CLAMP(src[3], 0, 255)) & 0xff;
         value |= (uint32_t)(((uint8_t)CLAMP(src[0], 0, 255)) & 0xff) << 8;
         value |= (uint32_t)(((uint8_t)CLAMP(src[1], 0, 255)) & 0xff) << 16;
         value |= (uint32_t)((uint8_t)CLAMP(src[2], 0, 255)) << 24;
#endif
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of A8B8G8R8_UINT pixels into unsigned RGBA.
 *
 * Reads width consecutive 4-byte pixels from src and writes four unsigned
 * ints per pixel to dst_row in r, g, b, a order.  The position of each
 * channel inside the 32-bit word depends on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a8b8g8r8_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
      /* src is only a byte pointer and need not be 4-byte aligned, so copy
       * the word out instead of dereferencing a cast uint32_t pointer. */
      uint32_t value;
      memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
      uint32_t a = value >> 24;
      uint32_t b = (value >> 16) & 0xff;
      uint32_t g = (value >> 8) & 0xff;
      uint32_t r = (value) & 0xff;
#else
      uint32_t a = (value) & 0xff;
      uint32_t b = (value >> 8) & 0xff;
      uint32_t g = (value >> 16) & 0xff;
      uint32_t r = value >> 24;
#endif
      dst[0] = (unsigned)r; /* r */
      dst[1] = (unsigned)g; /* g */
      dst[2] = (unsigned)b; /* b */
      dst[3] = (unsigned)a; /* a */
      src += 4;
      dst += 4;
   }
}

/**
 * Pack unsigned RGBA rows into A8B8G8R8_UINT pixels.
 *
 * For each of the width x height pixels, every source component is limited
 * with MIN2 to at most 255 and the four bytes are combined into one 32-bit
 * word; the position of each channel depends on UTIL_ARCH_BIG_ENDIAN.
 * dst_stride and src_stride are in bytes; each source pixel occupies four
 * unsigned ints.
 */
void
util_format_a8b8g8r8_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint8_t)MIN2(src[3], 255)) << 24;
         value |= (uint32_t)(((uint8_t)MIN2(src[2], 255)) & 0xff) << 16;
         value |= (uint32_t)(((uint8_t)MIN2(src[1], 255)) & 0xff) << 8;
         value |= ((uint8_t)MIN2(src[0], 255)) & 0xff;
#else
         value |= ((uint8_t)MIN2(src[3], 255)) & 0xff;
         value |= (uint32_t)(((uint8_t)MIN2(src[2], 255)) & 0xff) << 8;
         value |= (uint32_t)(((uint8_t)MIN2(src[1], 255)) & 0xff) << 16;
         value |= (uint32_t)((uint8_t)MIN2(src[0], 255)) << 24;
#endif
         /* dst is only a byte pointer and need not be 4-byte aligned, so
          * store with memcpy instead of through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A8B8G8R8_UINT texel as four unsigned RGBA values.
 *
 * The 32-bit word read from src is split into four 8-bit channels and
 * written in r, g, b, a order; the position of each channel depends on
 * UTIL_ARCH_BIG_ENDIAN.  The i and j arguments are not used.
 */
void
util_format_a8b8g8r8_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *dst = in_dst;
   uint32_t value;

   /* src is only a byte pointer and need not be 4-byte aligned, so copy
    * the word out instead of dereferencing a cast uint32_t pointer. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   uint32_t a = value >> 24;
   uint32_t b = (value >> 16) & 0xff;
   uint32_t g = (value >> 8) & 0xff;
   uint32_t r = (value) & 0xff;
#else
   uint32_t a = (value) & 0xff;
   uint32_t b = (value >> 8) & 0xff;
   uint32_t g = (value >> 16) & 0xff;
   uint32_t r = value >> 24;
#endif
   dst[0] = (unsigned)r; /* r */
   dst[1] = (unsigned)g; /* g */
   dst[2] = (unsigned)b; /* b */
   dst[3] = (unsigned)a; /* a */
}

/**
 * Pack rows of signed RGBA quads into 32-bit A8B8G8R8_UINT pixels.
 *
 * Each channel is clamped to [0, 255] before being stored in its 8-bit
 * field.  On little-endian builds A sits in bits 0-7 and R in bits 24-31;
 * big-endian builds use the mirrored bit positions.  dst_stride is applied
 * in bytes; src_stride is in bytes and is converted to an element count.
 */
void
util_format_a8b8g8r8_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const int *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t r = (uint8_t)CLAMP(rgba[0], 0, 255);
         const uint32_t g = (uint8_t)CLAMP(rgba[1], 0, 255);
         const uint32_t b = (uint8_t)CLAMP(rgba[2], 0, 255);
         const uint32_t a = (uint8_t)CLAMP(rgba[3], 0, 255);
#if UTIL_ARCH_BIG_ENDIAN
         *(uint32_t *)out = (a << 24) | (b << 16) | (g << 8) | r;
#else
         *(uint32_t *)out = a | (b << 8) | (g << 16) | (r << 24);
#endif
         rgba += 4;
         out += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack `width` A2R10G10B10_UINT pixels from src into unsigned RGBA quads.
 *
 * Bit layout of the natively loaded 32-bit word: A in bits 0-1, R in bits
 * 2-11, G in bits 12-21, B in bits 22-31.  The big- and little-endian
 * variants of this kernel use identical shifts and masks, so a single path
 * is used.
 */
void
util_format_a2r10g10b10_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *rgba = dst_row;
   for (unsigned left = width; left > 0; left--, src += 4, rgba += 4) {
      const uint32_t pixel = *(const uint32_t *)src;
      rgba[0] = (pixel >> 2) & 0x3ff;  /* r */
      rgba[1] = (pixel >> 12) & 0x3ff; /* g */
      rgba[2] = pixel >> 22;           /* b */
      rgba[3] = pixel & 0x3;           /* a */
   }
}

/**
 * Pack rows of unsigned RGBA quads into 32-bit A2R10G10B10_UINT pixels.
 *
 * Colour channels saturate at 1023 and alpha at 3.  Word layout: A in bits
 * 0-1, R in bits 2-11, G in bits 12-21, B in bits 22-31; the big- and
 * little-endian variants build the same word, so one path is used.
 * dst_stride is applied in bytes; src_stride is in bytes and is converted
 * to an element count.
 */
void
util_format_a2r10g10b10_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const unsigned *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t a = ((uint32_t)MIN2(rgba[3], 3)) & 0x3;
         const uint32_t r = ((uint32_t)MIN2(rgba[0], 1023)) & 0x3ff;
         const uint32_t g = ((uint32_t)MIN2(rgba[1], 1023)) & 0x3ff;
         const uint32_t b = (uint32_t)MIN2(rgba[2], 1023);
         *(uint32_t *)out = a | (r << 2) | (g << 12) | (b << 22);
         rgba += 4;
         out += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one A2R10G10B10_UINT pixel from src and write it to in_dst as four
 * unsigned values in r, g, b, a order.  The i and j arguments are unused.
 * Both endian variants decode the same bit positions, so one path is used.
 */
void
util_format_a2r10g10b10_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint32_t pixel = *(const uint32_t *)src;
   rgba[0] = (pixel >> 2) & 0x3ff;  /* r */
   rgba[1] = (pixel >> 12) & 0x3ff; /* g */
   rgba[2] = pixel >> 22;           /* b */
   rgba[3] = pixel & 0x3;           /* a */
}

/**
 * Pack rows of signed RGBA quads into 32-bit A2R10G10B10_UINT pixels.
 *
 * Colour channels are clamped to [0, 1023] and alpha to [0, 3].  Word
 * layout: A in bits 0-1, R in bits 2-11, G in bits 12-21, B in bits 22-31;
 * the big- and little-endian variants build the same word, so one path is
 * used.  dst_stride is applied in bytes; src_stride is in bytes and is
 * converted to an element count.
 */
void
util_format_a2r10g10b10_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const int *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t a = ((uint32_t)CLAMP(rgba[3], 0, 3)) & 0x3;
         const uint32_t r = ((uint32_t)CLAMP(rgba[0], 0, 1023)) & 0x3ff;
         const uint32_t g = ((uint32_t)CLAMP(rgba[1], 0, 1023)) & 0x3ff;
         const uint32_t b = (uint32_t)CLAMP(rgba[2], 0, 1023);
         *(uint32_t *)out = a | (r << 2) | (g << 12) | (b << 22);
         rgba += 4;
         out += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack `width` A2B10G10R10_UINT pixels from src into unsigned RGBA quads.
 *
 * Bit layout of the natively loaded 32-bit word: A in bits 0-1, B in bits
 * 2-11, G in bits 12-21, R in bits 22-31.  The big- and little-endian
 * variants of this kernel use identical shifts and masks, so a single path
 * is used.
 */
void
util_format_a2b10g10r10_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *rgba = dst_row;
   for (unsigned left = width; left > 0; left--, src += 4, rgba += 4) {
      const uint32_t pixel = *(const uint32_t *)src;
      rgba[0] = pixel >> 22;           /* r */
      rgba[1] = (pixel >> 12) & 0x3ff; /* g */
      rgba[2] = (pixel >> 2) & 0x3ff;  /* b */
      rgba[3] = pixel & 0x3;           /* a */
   }
}

/**
 * Pack rows of unsigned RGBA quads into 32-bit A2B10G10R10_UINT pixels.
 *
 * Colour channels saturate at 1023 and alpha at 3.  Word layout: A in bits
 * 0-1, B in bits 2-11, G in bits 12-21, R in bits 22-31; the big- and
 * little-endian variants build the same word, so one path is used.
 * dst_stride is applied in bytes; src_stride is in bytes and is converted
 * to an element count.
 */
void
util_format_a2b10g10r10_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const unsigned *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t a = ((uint32_t)MIN2(rgba[3], 3)) & 0x3;
         const uint32_t b = ((uint32_t)MIN2(rgba[2], 1023)) & 0x3ff;
         const uint32_t g = ((uint32_t)MIN2(rgba[1], 1023)) & 0x3ff;
         const uint32_t r = (uint32_t)MIN2(rgba[0], 1023);
         *(uint32_t *)out = a | (b << 2) | (g << 12) | (r << 22);
         rgba += 4;
         out += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one A2B10G10R10_UINT pixel from src and write it to in_dst as four
 * unsigned values in r, g, b, a order.  The i and j arguments are unused.
 * Both endian variants decode the same bit positions, so one path is used.
 */
void
util_format_a2b10g10r10_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint32_t pixel = *(const uint32_t *)src;
   rgba[0] = pixel >> 22;           /* r */
   rgba[1] = (pixel >> 12) & 0x3ff; /* g */
   rgba[2] = (pixel >> 2) & 0x3ff;  /* b */
   rgba[3] = pixel & 0x3;           /* a */
}

/**
 * Pack rows of signed RGBA quads into 32-bit A2B10G10R10_UINT pixels.
 *
 * Colour channels are clamped to [0, 1023] and alpha to [0, 3].  Word
 * layout: A in bits 0-1, B in bits 2-11, G in bits 12-21, R in bits 22-31;
 * the big- and little-endian variants build the same word, so one path is
 * used.  dst_stride is applied in bytes; src_stride is in bytes and is
 * converted to an element count.
 */
void
util_format_a2b10g10r10_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const int *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t a = ((uint32_t)CLAMP(rgba[3], 0, 3)) & 0x3;
         const uint32_t b = ((uint32_t)CLAMP(rgba[2], 0, 1023)) & 0x3ff;
         const uint32_t g = ((uint32_t)CLAMP(rgba[1], 0, 1023)) & 0x3ff;
         const uint32_t r = (uint32_t)CLAMP(rgba[0], 0, 1023);
         *(uint32_t *)out = a | (b << 2) | (g << 12) | (r << 22);
         rgba += 4;
         out += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack `width` B10G10R10A2_UINT pixels from src into unsigned RGBA quads.
 *
 * Bit layout of the natively loaded 32-bit word: B in bits 0-9, G in bits
 * 10-19, R in bits 20-29, A in bits 30-31.  The big- and little-endian
 * variants of this kernel use identical shifts and masks, so a single path
 * is used.
 */
void
util_format_b10g10r10a2_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *rgba = dst_row;
   for (unsigned left = width; left > 0; left--, src += 4, rgba += 4) {
      const uint32_t pixel = *(const uint32_t *)src;
      rgba[0] = (pixel >> 20) & 0x3ff; /* r */
      rgba[1] = (pixel >> 10) & 0x3ff; /* g */
      rgba[2] = pixel & 0x3ff;         /* b */
      rgba[3] = pixel >> 30;           /* a */
   }
}

/**
 * Pack rows of unsigned RGBA quads into 32-bit B10G10R10A2_UINT pixels.
 *
 * Colour channels saturate at 1023 and alpha at 3.  Word layout: B in bits
 * 0-9, G in bits 10-19, R in bits 20-29, A in bits 30-31; the big- and
 * little-endian variants build the same word, so one path is used.
 * dst_stride is applied in bytes; src_stride is in bytes and is converted
 * to an element count.
 */
void
util_format_b10g10r10a2_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const unsigned *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t b = ((uint32_t)MIN2(rgba[2], 1023)) & 0x3ff;
         const uint32_t g = ((uint32_t)MIN2(rgba[1], 1023)) & 0x3ff;
         const uint32_t r = ((uint32_t)MIN2(rgba[0], 1023)) & 0x3ff;
         const uint32_t a = (uint32_t)MIN2(rgba[3], 3);
         *(uint32_t *)out = b | (g << 10) | (r << 20) | (a << 30);
         rgba += 4;
         out += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one B10G10R10A2_UINT pixel from src and write it to in_dst as four
 * unsigned values in r, g, b, a order.  The i and j arguments are unused.
 * Both endian variants decode the same bit positions, so one path is used.
 */
void
util_format_b10g10r10a2_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint32_t pixel = *(const uint32_t *)src;
   rgba[0] = (pixel >> 20) & 0x3ff; /* r */
   rgba[1] = (pixel >> 10) & 0x3ff; /* g */
   rgba[2] = pixel & 0x3ff;         /* b */
   rgba[3] = pixel >> 30;           /* a */
}

/**
 * Pack rows of signed RGBA quads into 32-bit B10G10R10A2_UINT pixels.
 *
 * Colour channels are clamped to [0, 1023] and alpha to [0, 3].  Word
 * layout: B in bits 0-9, G in bits 10-19, R in bits 20-29, A in bits 30-31;
 * the big- and little-endian variants build the same word, so one path is
 * used.  dst_stride is applied in bytes; src_stride is in bytes and is
 * converted to an element count.
 */
void
util_format_b10g10r10a2_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const int *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t b = ((uint32_t)CLAMP(rgba[2], 0, 1023)) & 0x3ff;
         const uint32_t g = ((uint32_t)CLAMP(rgba[1], 0, 1023)) & 0x3ff;
         const uint32_t r = ((uint32_t)CLAMP(rgba[0], 0, 1023)) & 0x3ff;
         const uint32_t a = (uint32_t)CLAMP(rgba[3], 0, 3);
         *(uint32_t *)out = b | (g << 10) | (r << 20) | (a << 30);
         rgba += 4;
         out += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack `width` B10G10R10A2_SINT pixels from src into signed RGBA quads.
 *
 * Bit layout of the natively loaded 32-bit word: B in bits 0-9, G in bits
 * 10-19, R in bits 20-29, A in bits 30-31.  Each field is moved to the top
 * of the word and shifted back down as a signed value to sign-extend it.
 * The big- and little-endian variants use identical shifts, so a single
 * path is used.
 */
void
util_format_b10g10r10a2_sint_unpack_signed(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   int *rgba = dst_row;
   for (unsigned left = width; left > 0; left--, src += 4, rgba += 4) {
      const uint32_t pixel = *(const uint32_t *)src;
      rgba[0] = ((int32_t)(pixel << 2)) >> 22;  /* r */
      rgba[1] = ((int32_t)(pixel << 12)) >> 22; /* g */
      rgba[2] = ((int32_t)(pixel << 22)) >> 22; /* b */
      rgba[3] = ((int32_t)(pixel)) >> 30;       /* a */
   }
}

/**
 * Pack rows of signed RGBA quads into 32-bit B10G10R10A2_SINT pixels.
 *
 * Colour channels are clamped to [-512, 511] and alpha to [-2, 1]; the
 * values are then truncated to their field width.  Word layout: B in bits
 * 0-9, G in bits 10-19, R in bits 20-29, A in bits 30-31; the big- and
 * little-endian variants build the same word, so one path is used.
 * dst_stride is applied in bytes; src_stride is in bytes and is converted
 * to an element count.
 */
void
util_format_b10g10r10a2_sint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const int *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t b = ((uint32_t)CLAMP(rgba[2], -512, 511)) & 0x3ff;
         const uint32_t g = ((uint32_t)CLAMP(rgba[1], -512, 511)) & 0x3ff;
         const uint32_t r = ((uint32_t)CLAMP(rgba[0], -512, 511)) & 0x3ff;
         /* No mask needed: the shift by 30 discards all but the low 2 bits. */
         const uint32_t a = (uint32_t)CLAMP(rgba[3], -2, 1);
         *(uint32_t *)out = b | (g << 10) | (r << 20) | (a << 30);
         rgba += 4;
         out += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one B10G10R10A2_SINT pixel from src and write it to in_dst as four
 * sign-extended int values in r, g, b, a order.  The i and j arguments are
 * unused.  Both endian variants decode the same bit positions, so one path
 * is used.
 */
void
util_format_b10g10r10a2_sint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   int *rgba = in_dst;
   const uint32_t pixel = *(const uint32_t *)src;
   rgba[0] = ((int32_t)(pixel << 2)) >> 22;  /* r */
   rgba[1] = ((int32_t)(pixel << 12)) >> 22; /* g */
   rgba[2] = ((int32_t)(pixel << 22)) >> 22; /* b */
   rgba[3] = ((int32_t)(pixel)) >> 30;       /* a */
}

/**
 * Pack rows of unsigned RGBA quads into 32-bit B10G10R10A2_SINT pixels.
 *
 * Colour channels saturate at 511 and alpha at 1.  Word layout: B in bits
 * 0-9, G in bits 10-19, R in bits 20-29, A in bits 30-31; the big- and
 * little-endian variants build the same word, so one path is used.
 * dst_stride is applied in bytes; src_stride is in bytes and is converted
 * to an element count.
 */
void
util_format_b10g10r10a2_sint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const unsigned *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t b = ((uint32_t)MIN2(rgba[2], 511)) & 0x3ff;
         const uint32_t g = ((uint32_t)MIN2(rgba[1], 511)) & 0x3ff;
         const uint32_t r = ((uint32_t)MIN2(rgba[0], 511)) & 0x3ff;
         const uint32_t a = (uint32_t)MIN2(rgba[3], 1);
         *(uint32_t *)out = b | (g << 10) | (r << 20) | (a << 30);
         rgba += 4;
         out += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack `width` R5G6B5_UINT pixels from src into unsigned RGBA quads.
 *
 * Bit layout of the natively loaded 16-bit word: R in bits 0-4, G in bits
 * 5-10, B in bits 11-15.  Alpha is written as 1.  The big- and
 * little-endian variants of this kernel use identical shifts and masks, so
 * a single path is used.
 */
void
util_format_r5g6b5_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *rgba = dst_row;
   for (unsigned left = width; left > 0; left--, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      rgba[0] = pixel & 0x1f;        /* r */
      rgba[1] = (pixel >> 5) & 0x3f; /* g */
      rgba[2] = pixel >> 11;         /* b */
      rgba[3] = 1;                   /* a */
   }
}

/**
 * Pack rows of unsigned RGBA quads into 16-bit R5G6B5_UINT pixels.
 *
 * Red and blue saturate at 31, green at 63; the alpha input is ignored.
 * Word layout: R in bits 0-4, G in bits 5-10, B in bits 11-15; the big- and
 * little-endian variants build the same word, so one path is used.
 * dst_stride is applied in bytes; src_stride is in bytes and is converted
 * to an element count.
 */
void
util_format_r5g6b5_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const unsigned *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t r = ((uint16_t)MIN2(rgba[0], 31)) & 0x1f;
         const uint32_t g = ((uint16_t)MIN2(rgba[1], 63)) & 0x3f;
         const uint32_t b = (uint16_t)MIN2(rgba[2], 31);
         *(uint16_t *)out = (uint16_t)(r | (g << 5) | (b << 11));
         rgba += 4;
         out += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one R5G6B5_UINT pixel from src and write it to in_dst as four
 * unsigned values in r, g, b, a order, with alpha set to 1.  The i and j
 * arguments are unused.  Both endian variants decode the same bit
 * positions, so one path is used.
 */
void
util_format_r5g6b5_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   rgba[0] = pixel & 0x1f;        /* r */
   rgba[1] = (pixel >> 5) & 0x3f; /* g */
   rgba[2] = pixel >> 11;         /* b */
   rgba[3] = 1;                   /* a */
}

/**
 * Pack rows of signed RGBA quads into 16-bit R5G6B5_UINT pixels.
 *
 * Red and blue are clamped to [0, 31], green to [0, 63]; the alpha input is
 * ignored.  Word layout: R in bits 0-4, G in bits 5-10, B in bits 11-15;
 * the big- and little-endian variants build the same word, so one path is
 * used.  dst_stride is applied in bytes; src_stride is in bytes and is
 * converted to an element count.
 */
void
util_format_r5g6b5_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const int *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t r = ((uint16_t)CLAMP(rgba[0], 0, 31)) & 0x1f;
         const uint32_t g = ((uint16_t)CLAMP(rgba[1], 0, 63)) & 0x3f;
         const uint32_t b = (uint16_t)CLAMP(rgba[2], 0, 31);
         *(uint16_t *)out = (uint16_t)(r | (g << 5) | (b << 11));
         rgba += 4;
         out += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack `width` B5G6R5_UINT pixels from src into unsigned RGBA quads.
 *
 * Bit layout of the natively loaded 16-bit word: B in bits 0-4, G in bits
 * 5-10, R in bits 11-15.  Alpha is written as 1.  The big- and
 * little-endian variants of this kernel use identical shifts and masks, so
 * a single path is used.
 */
void
util_format_b5g6r5_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *rgba = dst_row;
   for (unsigned left = width; left > 0; left--, src += 2, rgba += 4) {
      const uint16_t pixel = *(const uint16_t *)src;
      rgba[0] = pixel >> 11;         /* r */
      rgba[1] = (pixel >> 5) & 0x3f; /* g */
      rgba[2] = pixel & 0x1f;        /* b */
      rgba[3] = 1;                   /* a */
   }
}

/**
 * Pack rows of unsigned RGBA quads into 16-bit B5G6R5_UINT pixels.
 *
 * Red and blue saturate at 31, green at 63; the alpha input is ignored.
 * Word layout: B in bits 0-4, G in bits 5-10, R in bits 11-15; the big- and
 * little-endian variants build the same word, so one path is used.
 * dst_stride is applied in bytes; src_stride is in bytes and is converted
 * to an element count.
 */
void
util_format_b5g6r5_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const unsigned *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t b = ((uint16_t)MIN2(rgba[2], 31)) & 0x1f;
         const uint32_t g = ((uint16_t)MIN2(rgba[1], 63)) & 0x3f;
         const uint32_t r = (uint16_t)MIN2(rgba[0], 31);
         *(uint16_t *)out = (uint16_t)(b | (g << 5) | (r << 11));
         rgba += 4;
         out += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one B5G6R5_UINT pixel from src and write it to in_dst as four
 * unsigned values in r, g, b, a order, with alpha set to 1.  The i and j
 * arguments are unused.  Both endian variants decode the same bit
 * positions, so one path is used.
 */
void
util_format_b5g6r5_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint16_t pixel = *(const uint16_t *)src;
   rgba[0] = pixel >> 11;         /* r */
   rgba[1] = (pixel >> 5) & 0x3f; /* g */
   rgba[2] = pixel & 0x1f;        /* b */
   rgba[3] = 1;                   /* a */
}

/**
 * Pack rows of signed RGBA quads into 16-bit B5G6R5_UINT pixels.
 *
 * Red and blue are clamped to [0, 31], green to [0, 63]; the alpha input is
 * ignored.  Word layout: B in bits 0-4, G in bits 5-10, R in bits 11-15;
 * the big- and little-endian variants build the same word, so one path is
 * used.  dst_stride is applied in bytes; src_stride is in bytes and is
 * converted to an element count.
 */
void
util_format_b5g6r5_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const int *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t b = ((uint16_t)CLAMP(rgba[2], 0, 31)) & 0x1f;
         const uint32_t g = ((uint16_t)CLAMP(rgba[1], 0, 63)) & 0x3f;
         const uint32_t r = (uint16_t)CLAMP(rgba[0], 0, 31);
         *(uint16_t *)out = (uint16_t)(b | (g << 5) | (r << 11));
         rgba += 4;
         out += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack `width` R3G3B2_UINT pixels from src into unsigned RGBA quads.
 *
 * Bit layout of each byte: R in bits 0-2, G in bits 3-5, B in bits 6-7.
 * Alpha is written as 1.  The big- and little-endian variants of this
 * kernel use identical shifts and masks, so a single path is used.
 */
void
util_format_r3g3b2_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *rgba = dst_row;
   for (unsigned left = width; left > 0; left--, src += 1, rgba += 4) {
      const uint8_t pixel = *src;
      rgba[0] = pixel & 0x7;        /* r */
      rgba[1] = (pixel >> 3) & 0x7; /* g */
      rgba[2] = pixel >> 6;         /* b */
      rgba[3] = 1;                  /* a */
   }
}

/**
 * Pack rows of unsigned RGBA quads into 8-bit R3G3B2_UINT pixels.
 *
 * Red and green saturate at 7, blue at 3; the alpha input is ignored.
 * Byte layout: R in bits 0-2, G in bits 3-5, B in bits 6-7; the big- and
 * little-endian variants build the same byte, so one path is used.
 * dst_stride is applied in bytes; src_stride is in bytes and is converted
 * to an element count.
 */
void
util_format_r3g3b2_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const unsigned *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t r = ((uint8_t)MIN2(rgba[0], 7)) & 0x7;
         const uint32_t g = ((uint8_t)MIN2(rgba[1], 7)) & 0x7;
         const uint32_t b = (uint8_t)MIN2(rgba[2], 3);
         *out = (uint8_t)(r | (g << 3) | (b << 6));
         rgba += 4;
         out += 1;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one R3G3B2_UINT pixel from src and write it to in_dst as four
 * unsigned values in r, g, b, a order, with alpha set to 1.  The i and j
 * arguments are unused.  Both endian variants decode the same bit
 * positions, so one path is used.
 */
void
util_format_r3g3b2_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint8_t pixel = *src;
   rgba[0] = pixel & 0x7;        /* r */
   rgba[1] = (pixel >> 3) & 0x7; /* g */
   rgba[2] = pixel >> 6;         /* b */
   rgba[3] = 1;                  /* a */
}

/**
 * Pack rows of signed RGBA quads into 8-bit R3G3B2_UINT pixels.
 *
 * Red and green are clamped to [0, 7], blue to [0, 3]; the alpha input is
 * ignored.  Byte layout: R in bits 0-2, G in bits 3-5, B in bits 6-7; the
 * big- and little-endian variants build the same byte, so one path is used.
 * dst_stride is applied in bytes; src_stride is in bytes and is converted
 * to an element count.
 */
void
util_format_r3g3b2_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const int *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t r = ((uint8_t)CLAMP(rgba[0], 0, 7)) & 0x7;
         const uint32_t g = ((uint8_t)CLAMP(rgba[1], 0, 7)) & 0x7;
         const uint32_t b = (uint8_t)CLAMP(rgba[2], 0, 3);
         *out = (uint8_t)(r | (g << 3) | (b << 6));
         rgba += 4;
         out += 1;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack `width` B2G3R3_UINT pixels from src into unsigned RGBA quads.
 *
 * Bit layout of each byte: B in bits 0-1, G in bits 2-4, R in bits 5-7.
 * Alpha is written as 1.  The big- and little-endian variants of this
 * kernel use identical shifts and masks, so a single path is used.
 */
void
util_format_b2g3r3_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *rgba = dst_row;
   for (unsigned left = width; left > 0; left--, src += 1, rgba += 4) {
      const uint8_t pixel = *src;
      rgba[0] = pixel >> 5;         /* r */
      rgba[1] = (pixel >> 2) & 0x7; /* g */
      rgba[2] = pixel & 0x3;        /* b */
      rgba[3] = 1;                  /* a */
   }
}

/**
 * Pack rows of unsigned RGBA quads into 8-bit B2G3R3_UINT pixels.
 *
 * Red and green saturate at 7, blue at 3; the alpha input is ignored.
 * Byte layout: B in bits 0-1, G in bits 2-4, R in bits 5-7; the big- and
 * little-endian variants build the same byte, so one path is used.
 * dst_stride is applied in bytes; src_stride is in bytes and is converted
 * to an element count.
 */
void
util_format_b2g3r3_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row++) {
      const unsigned *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col++) {
         const uint32_t b = ((uint8_t)MIN2(rgba[2], 3)) & 0x3;
         const uint32_t g = ((uint8_t)MIN2(rgba[1], 7)) & 0x7;
         const uint32_t r = (uint8_t)MIN2(rgba[0], 7);
         *out = (uint8_t)(b | (g << 2) | (r << 5));
         rgba += 4;
         out += 1;
      }
      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/*
 * Decode the single B2G3R3_UINT texel at src into four unsigned values.
 *
 * Blue is taken from bits 0-1, green from bits 2-4 and red from bits 5-7
 * of the source byte; alpha is written as 1.  The i and j arguments are
 * not used.  The decoding is the same for either value of
 * UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_b2g3r3_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint8_t texel = src[0];

   rgba[0] = (unsigned)(texel >> 5);         /* r */
   rgba[1] = (unsigned)((texel >> 2) & 0x7); /* g */
   rgba[2] = (unsigned)(texel & 0x3);        /* b */
   rgba[3] = 1;                              /* a */
}

/*
 * Pack rows of signed RGBA tuples into B2G3R3_UINT texels (one byte each).
 *
 * Byte layout: blue in bits 0-1, green in bits 2-4, red in bits 5-7; the
 * source alpha component is not read.  Components go through CLAMP
 * (assumed to limit the value to [0, field maximum]) before packing.
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of int elements.  The byte produced does not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_b2g3r3_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const int *rgba = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint8_t r = (uint8_t)CLAMP(rgba[0], 0, 7);
         const uint8_t g = ((uint8_t)CLAMP(rgba[1], 0, 7)) & 0x7;
         const uint8_t b = ((uint8_t)CLAMP(rgba[2], 0, 3)) & 0x3;
         *out++ = (uint8_t)(((uint32_t)r << 5) | ((uint32_t)g << 2) | b);
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Expand a run of packed R4G4B4A4_UINT texels into unsigned RGBA tuples.
 *
 * Each texel is read as one native 16-bit word: red in bits 0-3, green in
 * bits 4-7, blue in bits 8-11, alpha in bits 12-15.
 *
 * dst_row: receives four unsigned values (r, g, b, a) per texel.
 * src:     packed input, two bytes per texel.
 * width:   number of texels to convert.
 *
 * The same shifts apply for either value of UTIL_ARCH_BIG_ENDIAN, so no
 * per-endian variant is needed.
 */
void
util_format_r4g4b4a4_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const uint16_t *texels = (const uint16_t *)src;
   unsigned *rgba = dst_row;
   for (unsigned left = width; left != 0; left -= 1) {
      const uint16_t texel = *texels++;
      *rgba++ = (unsigned)(texel & 0xf);        /* r */
      *rgba++ = (unsigned)((texel >> 4) & 0xf); /* g */
      *rgba++ = (unsigned)((texel >> 8) & 0xf); /* b */
      *rgba++ = (unsigned)(texel >> 12);        /* a */
   }
}

/*
 * Pack rows of unsigned RGBA tuples into R4G4B4A4_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: red in bits 0-3, green
 * in bits 4-7, blue in bits 8-11, alpha in bits 12-15.  Components are
 * capped at 15 with MIN2 (assumed to be a plain two-value minimum).
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of unsigned elements.  The word produced does not
 * depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r4g4b4a4_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const unsigned *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = ((uint16_t)MIN2(rgba[0], 15)) & 0xf;
         const uint16_t g = ((uint16_t)MIN2(rgba[1], 15)) & 0xf;
         const uint16_t b = ((uint16_t)MIN2(rgba[2], 15)) & 0xf;
         const uint16_t a = (uint16_t)MIN2(rgba[3], 15);
         *out++ = (uint16_t)(r | ((uint32_t)g << 4) | ((uint32_t)b << 8) | ((uint32_t)a << 12));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Decode the single R4G4B4A4_UINT texel at src into four unsigned values.
 *
 * The texel is read as one native 16-bit word: red in bits 0-3, green in
 * bits 4-7, blue in bits 8-11, alpha in bits 12-15.  The i and j arguments
 * are not used.  The decoding is the same for either value of
 * UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r4g4b4a4_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint16_t texel = *(const uint16_t *)src;

   rgba[0] = (unsigned)(texel & 0xf);        /* r */
   rgba[1] = (unsigned)((texel >> 4) & 0xf); /* g */
   rgba[2] = (unsigned)((texel >> 8) & 0xf); /* b */
   rgba[3] = (unsigned)(texel >> 12);        /* a */
}

/*
 * Pack rows of signed RGBA tuples into R4G4B4A4_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: red in bits 0-3, green
 * in bits 4-7, blue in bits 8-11, alpha in bits 12-15.  Components go
 * through CLAMP (assumed to limit the value to [0, 15]) before packing.
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of int elements.  The word produced does not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r4g4b4a4_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const int *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = ((uint16_t)CLAMP(rgba[0], 0, 15)) & 0xf;
         const uint16_t g = ((uint16_t)CLAMP(rgba[1], 0, 15)) & 0xf;
         const uint16_t b = ((uint16_t)CLAMP(rgba[2], 0, 15)) & 0xf;
         const uint16_t a = (uint16_t)CLAMP(rgba[3], 0, 15);
         *out++ = (uint16_t)(r | ((uint32_t)g << 4) | ((uint32_t)b << 8) | ((uint32_t)a << 12));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Expand a run of packed B4G4R4A4_UINT texels into unsigned RGBA tuples.
 *
 * Each texel is read as one native 16-bit word: blue in bits 0-3, green in
 * bits 4-7, red in bits 8-11, alpha in bits 12-15.
 *
 * dst_row: receives four unsigned values (r, g, b, a) per texel.
 * src:     packed input, two bytes per texel.
 * width:   number of texels to convert.
 *
 * The same shifts apply for either value of UTIL_ARCH_BIG_ENDIAN, so no
 * per-endian variant is needed.
 */
void
util_format_b4g4r4a4_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const uint16_t *texels = (const uint16_t *)src;
   unsigned *rgba = dst_row;
   for (unsigned left = width; left != 0; left -= 1) {
      const uint16_t texel = *texels++;
      *rgba++ = (unsigned)((texel >> 8) & 0xf); /* r */
      *rgba++ = (unsigned)((texel >> 4) & 0xf); /* g */
      *rgba++ = (unsigned)(texel & 0xf);        /* b */
      *rgba++ = (unsigned)(texel >> 12);        /* a */
   }
}

/*
 * Pack rows of unsigned RGBA tuples into B4G4R4A4_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: blue in bits 0-3, green
 * in bits 4-7, red in bits 8-11, alpha in bits 12-15.  Components are
 * capped at 15 with MIN2 (assumed to be a plain two-value minimum).
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of unsigned elements.  The word produced does not
 * depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_b4g4r4a4_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const unsigned *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = ((uint16_t)MIN2(rgba[0], 15)) & 0xf;
         const uint16_t g = ((uint16_t)MIN2(rgba[1], 15)) & 0xf;
         const uint16_t b = ((uint16_t)MIN2(rgba[2], 15)) & 0xf;
         const uint16_t a = (uint16_t)MIN2(rgba[3], 15);
         *out++ = (uint16_t)(b | ((uint32_t)g << 4) | ((uint32_t)r << 8) | ((uint32_t)a << 12));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Decode the single B4G4R4A4_UINT texel at src into four unsigned values.
 *
 * The texel is read as one native 16-bit word: blue in bits 0-3, green in
 * bits 4-7, red in bits 8-11, alpha in bits 12-15.  The i and j arguments
 * are not used.  The decoding is the same for either value of
 * UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_b4g4r4a4_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint16_t texel = *(const uint16_t *)src;

   rgba[0] = (unsigned)((texel >> 8) & 0xf); /* r */
   rgba[1] = (unsigned)((texel >> 4) & 0xf); /* g */
   rgba[2] = (unsigned)(texel & 0xf);        /* b */
   rgba[3] = (unsigned)(texel >> 12);        /* a */
}

/*
 * Pack rows of signed RGBA tuples into B4G4R4A4_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: blue in bits 0-3, green
 * in bits 4-7, red in bits 8-11, alpha in bits 12-15.  Components go
 * through CLAMP (assumed to limit the value to [0, 15]) before packing.
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of int elements.  The word produced does not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_b4g4r4a4_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const int *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = ((uint16_t)CLAMP(rgba[0], 0, 15)) & 0xf;
         const uint16_t g = ((uint16_t)CLAMP(rgba[1], 0, 15)) & 0xf;
         const uint16_t b = ((uint16_t)CLAMP(rgba[2], 0, 15)) & 0xf;
         const uint16_t a = (uint16_t)CLAMP(rgba[3], 0, 15);
         *out++ = (uint16_t)(b | ((uint32_t)g << 4) | ((uint32_t)r << 8) | ((uint32_t)a << 12));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Expand a run of packed A4R4G4B4_UINT texels into unsigned RGBA tuples.
 *
 * Each texel is read as one native 16-bit word: alpha in bits 0-3, red in
 * bits 4-7, green in bits 8-11, blue in bits 12-15.
 *
 * dst_row: receives four unsigned values (r, g, b, a) per texel.
 * src:     packed input, two bytes per texel.
 * width:   number of texels to convert.
 *
 * The same shifts apply for either value of UTIL_ARCH_BIG_ENDIAN, so no
 * per-endian variant is needed.
 */
void
util_format_a4r4g4b4_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const uint16_t *texels = (const uint16_t *)src;
   unsigned *rgba = dst_row;
   for (unsigned left = width; left != 0; left -= 1) {
      const uint16_t texel = *texels++;
      *rgba++ = (unsigned)((texel >> 4) & 0xf); /* r */
      *rgba++ = (unsigned)((texel >> 8) & 0xf); /* g */
      *rgba++ = (unsigned)(texel >> 12);        /* b */
      *rgba++ = (unsigned)(texel & 0xf);        /* a */
   }
}

/*
 * Pack rows of unsigned RGBA tuples into A4R4G4B4_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: alpha in bits 0-3, red
 * in bits 4-7, green in bits 8-11, blue in bits 12-15.  Components are
 * capped at 15 with MIN2 (assumed to be a plain two-value minimum).
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of unsigned elements.  The word produced does not
 * depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a4r4g4b4_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const unsigned *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = ((uint16_t)MIN2(rgba[0], 15)) & 0xf;
         const uint16_t g = ((uint16_t)MIN2(rgba[1], 15)) & 0xf;
         const uint16_t b = (uint16_t)MIN2(rgba[2], 15);
         const uint16_t a = ((uint16_t)MIN2(rgba[3], 15)) & 0xf;
         *out++ = (uint16_t)(a | ((uint32_t)r << 4) | ((uint32_t)g << 8) | ((uint32_t)b << 12));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Decode the single A4R4G4B4_UINT texel at src into four unsigned values.
 *
 * The texel is read as one native 16-bit word: alpha in bits 0-3, red in
 * bits 4-7, green in bits 8-11, blue in bits 12-15.  The i and j arguments
 * are not used.  The decoding is the same for either value of
 * UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a4r4g4b4_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint16_t texel = *(const uint16_t *)src;

   rgba[0] = (unsigned)((texel >> 4) & 0xf); /* r */
   rgba[1] = (unsigned)((texel >> 8) & 0xf); /* g */
   rgba[2] = (unsigned)(texel >> 12);        /* b */
   rgba[3] = (unsigned)(texel & 0xf);        /* a */
}

/*
 * Pack rows of signed RGBA tuples into A4R4G4B4_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: alpha in bits 0-3, red
 * in bits 4-7, green in bits 8-11, blue in bits 12-15.  Components go
 * through CLAMP (assumed to limit the value to [0, 15]) before packing.
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of int elements.  The word produced does not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a4r4g4b4_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const int *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = ((uint16_t)CLAMP(rgba[0], 0, 15)) & 0xf;
         const uint16_t g = ((uint16_t)CLAMP(rgba[1], 0, 15)) & 0xf;
         const uint16_t b = (uint16_t)CLAMP(rgba[2], 0, 15);
         const uint16_t a = ((uint16_t)CLAMP(rgba[3], 0, 15)) & 0xf;
         *out++ = (uint16_t)(a | ((uint32_t)r << 4) | ((uint32_t)g << 8) | ((uint32_t)b << 12));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Expand a run of packed A4B4G4R4_UINT texels into unsigned RGBA tuples.
 *
 * Each texel is read as one native 16-bit word: alpha in bits 0-3, blue in
 * bits 4-7, green in bits 8-11, red in bits 12-15.
 *
 * dst_row: receives four unsigned values (r, g, b, a) per texel.
 * src:     packed input, two bytes per texel.
 * width:   number of texels to convert.
 *
 * The same shifts apply for either value of UTIL_ARCH_BIG_ENDIAN, so no
 * per-endian variant is needed.
 */
void
util_format_a4b4g4r4_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const uint16_t *texels = (const uint16_t *)src;
   unsigned *rgba = dst_row;
   for (unsigned left = width; left != 0; left -= 1) {
      const uint16_t texel = *texels++;
      *rgba++ = (unsigned)(texel >> 12);        /* r */
      *rgba++ = (unsigned)((texel >> 8) & 0xf); /* g */
      *rgba++ = (unsigned)((texel >> 4) & 0xf); /* b */
      *rgba++ = (unsigned)(texel & 0xf);        /* a */
   }
}

/*
 * Pack rows of unsigned RGBA tuples into A4B4G4R4_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: alpha in bits 0-3, blue
 * in bits 4-7, green in bits 8-11, red in bits 12-15.  Components are
 * capped at 15 with MIN2 (assumed to be a plain two-value minimum).
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of unsigned elements.  The word produced does not
 * depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a4b4g4r4_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const unsigned *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = (uint16_t)MIN2(rgba[0], 15);
         const uint16_t g = ((uint16_t)MIN2(rgba[1], 15)) & 0xf;
         const uint16_t b = ((uint16_t)MIN2(rgba[2], 15)) & 0xf;
         const uint16_t a = ((uint16_t)MIN2(rgba[3], 15)) & 0xf;
         *out++ = (uint16_t)(a | ((uint32_t)b << 4) | ((uint32_t)g << 8) | ((uint32_t)r << 12));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Decode the single A4B4G4R4_UINT texel at src into four unsigned values.
 *
 * The texel is read as one native 16-bit word: alpha in bits 0-3, blue in
 * bits 4-7, green in bits 8-11, red in bits 12-15.  The i and j arguments
 * are not used.  The decoding is the same for either value of
 * UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a4b4g4r4_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint16_t texel = *(const uint16_t *)src;

   rgba[0] = (unsigned)(texel >> 12);        /* r */
   rgba[1] = (unsigned)((texel >> 8) & 0xf); /* g */
   rgba[2] = (unsigned)((texel >> 4) & 0xf); /* b */
   rgba[3] = (unsigned)(texel & 0xf);        /* a */
}

/*
 * Pack rows of signed RGBA tuples into A4B4G4R4_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: alpha in bits 0-3, blue
 * in bits 4-7, green in bits 8-11, red in bits 12-15.  Components go
 * through CLAMP (assumed to limit the value to [0, 15]) before packing.
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of int elements.  The word produced does not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a4b4g4r4_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const int *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = (uint16_t)CLAMP(rgba[0], 0, 15);
         const uint16_t g = ((uint16_t)CLAMP(rgba[1], 0, 15)) & 0xf;
         const uint16_t b = ((uint16_t)CLAMP(rgba[2], 0, 15)) & 0xf;
         const uint16_t a = ((uint16_t)CLAMP(rgba[3], 0, 15)) & 0xf;
         *out++ = (uint16_t)(a | ((uint32_t)b << 4) | ((uint32_t)g << 8) | ((uint32_t)r << 12));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Expand a run of packed A1R5G5B5_UINT texels into unsigned RGBA tuples.
 *
 * Each texel is read as one native 16-bit word: alpha in bit 0, red in
 * bits 1-5, green in bits 6-10, blue in bits 11-15.
 *
 * dst_row: receives four unsigned values (r, g, b, a) per texel.
 * src:     packed input, two bytes per texel.
 * width:   number of texels to convert.
 *
 * The same shifts apply for either value of UTIL_ARCH_BIG_ENDIAN, so no
 * per-endian variant is needed.
 */
void
util_format_a1r5g5b5_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const uint16_t *texels = (const uint16_t *)src;
   unsigned *rgba = dst_row;
   for (unsigned left = width; left != 0; left -= 1) {
      const uint16_t texel = *texels++;
      *rgba++ = (unsigned)((texel >> 1) & 0x1f); /* r */
      *rgba++ = (unsigned)((texel >> 6) & 0x1f); /* g */
      *rgba++ = (unsigned)(texel >> 11);         /* b */
      *rgba++ = (unsigned)(texel & 0x1);         /* a */
   }
}

/*
 * Pack rows of unsigned RGBA tuples into A1R5G5B5_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: alpha in bit 0, red in
 * bits 1-5, green in bits 6-10, blue in bits 11-15.  Colour components are
 * capped at 31 and alpha at 1 with MIN2 (assumed to be a plain two-value
 * minimum).
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of unsigned elements.  The word produced does not
 * depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a1r5g5b5_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const unsigned *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = ((uint16_t)MIN2(rgba[0], 31)) & 0x1f;
         const uint16_t g = ((uint16_t)MIN2(rgba[1], 31)) & 0x1f;
         const uint16_t b = (uint16_t)MIN2(rgba[2], 31);
         const uint16_t a = ((uint16_t)MIN2(rgba[3], 1)) & 0x1;
         *out++ = (uint16_t)(a | ((uint32_t)r << 1) | ((uint32_t)g << 6) | ((uint32_t)b << 11));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Decode the single A1R5G5B5_UINT texel at src into four unsigned values.
 *
 * The texel is read as one native 16-bit word: alpha in bit 0, red in
 * bits 1-5, green in bits 6-10, blue in bits 11-15.  The i and j arguments
 * are not used.  The decoding is the same for either value of
 * UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a1r5g5b5_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint16_t texel = *(const uint16_t *)src;

   rgba[0] = (unsigned)((texel >> 1) & 0x1f); /* r */
   rgba[1] = (unsigned)((texel >> 6) & 0x1f); /* g */
   rgba[2] = (unsigned)(texel >> 11);         /* b */
   rgba[3] = (unsigned)(texel & 0x1);         /* a */
}

/*
 * Pack rows of signed RGBA tuples into A1R5G5B5_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: alpha in bit 0, red in
 * bits 1-5, green in bits 6-10, blue in bits 11-15.  Components go through
 * CLAMP (assumed to limit colour to [0, 31] and alpha to [0, 1]) before
 * packing.
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of int elements.  The word produced does not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a1r5g5b5_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const int *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = ((uint16_t)CLAMP(rgba[0], 0, 31)) & 0x1f;
         const uint16_t g = ((uint16_t)CLAMP(rgba[1], 0, 31)) & 0x1f;
         const uint16_t b = (uint16_t)CLAMP(rgba[2], 0, 31);
         const uint16_t a = ((uint16_t)CLAMP(rgba[3], 0, 1)) & 0x1;
         *out++ = (uint16_t)(a | ((uint32_t)r << 1) | ((uint32_t)g << 6) | ((uint32_t)b << 11));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Expand a run of packed A1B5G5R5_UINT texels into unsigned RGBA tuples.
 *
 * Each texel is read as one native 16-bit word: alpha in bit 0, blue in
 * bits 1-5, green in bits 6-10, red in bits 11-15.
 *
 * dst_row: receives four unsigned values (r, g, b, a) per texel.
 * src:     packed input, two bytes per texel.
 * width:   number of texels to convert.
 *
 * The same shifts apply for either value of UTIL_ARCH_BIG_ENDIAN, so no
 * per-endian variant is needed.
 */
void
util_format_a1b5g5r5_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const uint16_t *texels = (const uint16_t *)src;
   unsigned *rgba = dst_row;
   for (unsigned left = width; left != 0; left -= 1) {
      const uint16_t texel = *texels++;
      *rgba++ = (unsigned)(texel >> 11);         /* r */
      *rgba++ = (unsigned)((texel >> 6) & 0x1f); /* g */
      *rgba++ = (unsigned)((texel >> 1) & 0x1f); /* b */
      *rgba++ = (unsigned)(texel & 0x1);         /* a */
   }
}

/*
 * Pack rows of unsigned RGBA tuples into A1B5G5R5_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: alpha in bit 0, blue in
 * bits 1-5, green in bits 6-10, red in bits 11-15.  Colour components are
 * capped at 31 and alpha at 1 with MIN2 (assumed to be a plain two-value
 * minimum).
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of unsigned elements.  The word produced does not
 * depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a1b5g5r5_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const unsigned *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = (uint16_t)MIN2(rgba[0], 31);
         const uint16_t g = ((uint16_t)MIN2(rgba[1], 31)) & 0x1f;
         const uint16_t b = ((uint16_t)MIN2(rgba[2], 31)) & 0x1f;
         const uint16_t a = ((uint16_t)MIN2(rgba[3], 1)) & 0x1;
         *out++ = (uint16_t)(a | ((uint32_t)b << 1) | ((uint32_t)g << 6) | ((uint32_t)r << 11));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Decode the single A1B5G5R5_UINT texel at src into four unsigned values.
 *
 * The texel is read as one native 16-bit word: alpha in bit 0, blue in
 * bits 1-5, green in bits 6-10, red in bits 11-15.  The i and j arguments
 * are not used.  The decoding is the same for either value of
 * UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a1b5g5r5_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint16_t texel = *(const uint16_t *)src;

   rgba[0] = (unsigned)(texel >> 11);         /* r */
   rgba[1] = (unsigned)((texel >> 6) & 0x1f); /* g */
   rgba[2] = (unsigned)((texel >> 1) & 0x1f); /* b */
   rgba[3] = (unsigned)(texel & 0x1);         /* a */
}

/*
 * Pack rows of signed RGBA tuples into A1B5G5R5_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: alpha in bit 0, blue in
 * bits 1-5, green in bits 6-10, red in bits 11-15.  Components go through
 * CLAMP (assumed to limit colour to [0, 31] and alpha to [0, 1]) before
 * packing.
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of int elements.  The word produced does not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_a1b5g5r5_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const int *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = (uint16_t)CLAMP(rgba[0], 0, 31);
         const uint16_t g = ((uint16_t)CLAMP(rgba[1], 0, 31)) & 0x1f;
         const uint16_t b = ((uint16_t)CLAMP(rgba[2], 0, 31)) & 0x1f;
         const uint16_t a = ((uint16_t)CLAMP(rgba[3], 0, 1)) & 0x1;
         *out++ = (uint16_t)(a | ((uint32_t)b << 1) | ((uint32_t)g << 6) | ((uint32_t)r << 11));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Expand a run of packed R5G5B5A1_UINT texels into unsigned RGBA tuples.
 *
 * Each texel is read as one native 16-bit word: red in bits 0-4, green in
 * bits 5-9, blue in bits 10-14, alpha in bit 15.
 *
 * dst_row: receives four unsigned values (r, g, b, a) per texel.
 * src:     packed input, two bytes per texel.
 * width:   number of texels to convert.
 *
 * The same shifts apply for either value of UTIL_ARCH_BIG_ENDIAN, so no
 * per-endian variant is needed.
 */
void
util_format_r5g5b5a1_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   const uint16_t *texels = (const uint16_t *)src;
   unsigned *rgba = dst_row;
   for (unsigned left = width; left != 0; left -= 1) {
      const uint16_t texel = *texels++;
      *rgba++ = (unsigned)(texel & 0x1f);         /* r */
      *rgba++ = (unsigned)((texel >> 5) & 0x1f);  /* g */
      *rgba++ = (unsigned)((texel >> 10) & 0x1f); /* b */
      *rgba++ = (unsigned)(texel >> 15);          /* a */
   }
}

/*
 * Pack rows of unsigned RGBA tuples into R5G5B5A1_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: red in bits 0-4, green
 * in bits 5-9, blue in bits 10-14, alpha in bit 15.  Colour components are
 * capped at 31 and alpha at 1 with MIN2 (assumed to be a plain two-value
 * minimum).
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of unsigned elements.  The word produced does not
 * depend on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r5g5b5a1_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const unsigned *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = ((uint16_t)MIN2(rgba[0], 31)) & 0x1f;
         const uint16_t g = ((uint16_t)MIN2(rgba[1], 31)) & 0x1f;
         const uint16_t b = ((uint16_t)MIN2(rgba[2], 31)) & 0x1f;
         const uint16_t a = (uint16_t)MIN2(rgba[3], 1);
         *out++ = (uint16_t)(r | ((uint32_t)g << 5) | ((uint32_t)b << 10) | ((uint32_t)a << 15));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * Decode the single R5G5B5A1_UINT texel at src into four unsigned values.
 *
 * The texel is read as one native 16-bit word: red in bits 0-4, green in
 * bits 5-9, blue in bits 10-14, alpha in bit 15.  The i and j arguments
 * are not used.  The decoding is the same for either value of
 * UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r5g5b5a1_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   unsigned *rgba = in_dst;
   const uint16_t texel = *(const uint16_t *)src;

   rgba[0] = (unsigned)(texel & 0x1f);         /* r */
   rgba[1] = (unsigned)((texel >> 5) & 0x1f);  /* g */
   rgba[2] = (unsigned)((texel >> 10) & 0x1f); /* b */
   rgba[3] = (unsigned)(texel >> 15);          /* a */
}

/*
 * Pack rows of signed RGBA tuples into R5G5B5A1_UINT texels.
 *
 * Each texel is stored as one native 16-bit word: red in bits 0-4, green
 * in bits 5-9, blue in bits 10-14, alpha in bit 15.  Components go through
 * CLAMP (assumed to limit colour to [0, 31] and alpha to [0, 1]) before
 * packing.
 *
 * dst_stride is applied in bytes; src_stride is also given in bytes and is
 * converted to a count of int elements.  The word produced does not depend
 * on UTIL_ARCH_BIG_ENDIAN.
 */
void
util_format_r5g5b5a1_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; row += 1) {
      const int *rgba = src_row;
      uint16_t *out = (uint16_t *)dst_row;
      for (unsigned col = 0; col < width; col += 1) {
         const uint16_t r = ((uint16_t)CLAMP(rgba[0], 0, 31)) & 0x1f;
         const uint16_t g = ((uint16_t)CLAMP(rgba[1], 0, 31)) & 0x1f;
         const uint16_t b = ((uint16_t)CLAMP(rgba[2], 0, 31)) & 0x1f;
         const uint16_t a = (uint16_t)CLAMP(rgba[3], 0, 1);
         *out++ = (uint16_t)(r | ((uint32_t)g << 5) | ((uint32_t)b << 10) | ((uint32_t)a << 15));
         rgba += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a run of b5g5r5a1_uint texels into unsigned RGBA quadruples.
 *
 * For each of the width texels, two bytes are read from src as a host-order
 * uint16_t and split into blue (bits 0-4), green (bits 5-9), red
 * (bits 10-14) and alpha (bit 15).  Four unsigned values are written to
 * dst_row per texel, in r, g, b, a order.
 *
 * The field positions are the same for big- and little-endian builds of
 * this format, so a single code path serves both.
 */
void
util_format_b5g5r5a1_uint_unpack_unsigned(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   unsigned *rgba = dst_row;

   for (unsigned remaining = width; remaining != 0; --remaining) {
      const uint16_t texel = *(const uint16_t *)src;

      rgba[0] = (unsigned)((texel >> 10) & 0x1f); /* r */
      rgba[1] = (unsigned)((texel >> 5) & 0x1f);  /* g */
      rgba[2] = (unsigned)(texel & 0x1f);         /* b */
      rgba[3] = (unsigned)(texel >> 15);          /* a */

      src += 2;
      rgba += 4;
   }
}

/**
 * Pack rows of unsigned-integer RGBA pixels into 16-bit b5g5r5a1_uint texels.
 *
 * Every source pixel is four unsigned values (r, g, b, a).  Red, green and
 * blue are passed through MIN2(c, 31) and alpha through MIN2(c, 1); the
 * results land in bits 0-4 (b), 5-9 (g), 10-14 (r) and 15 (a) of the texel,
 * which is stored as a host-order uint16_t.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows of the destination and source respectively.
 *
 * The field positions are the same for big- and little-endian builds of
 * this format, so a single code path serves both.
 */
void
util_format_b5g5r5a1_uint_pack_unsigned(uint8_t *restrict dst_row, unsigned dst_stride, const unsigned *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const unsigned *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint16_t r = ((uint16_t)MIN2(rgba[0], 31)) & 0x1f;
         const uint16_t g = ((uint16_t)MIN2(rgba[1], 31)) & 0x1f;
         const uint16_t b = ((uint16_t)MIN2(rgba[2], 31)) & 0x1f;
         const uint16_t a = (uint16_t)MIN2(rgba[3], 1);

         /* Assemble in 32 bits, then narrow to the 16-bit texel. */
         *(uint16_t *)out = (uint16_t)((uint32_t)b |
                                       ((uint32_t)g << 5) |
                                       ((uint32_t)r << 10) |
                                       ((uint32_t)a << 15));

         rgba += 4;
         out += 2;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single b5g5r5a1_uint texel as four unsigned integers.
 *
 * The 16 bits at src are read as one host-order uint16_t and split into
 * blue (bits 0-4), green (bits 5-9), red (bits 10-14) and alpha (bit 15).
 * The channels are written to in_dst as unsigned r, g, b, a, in that order.
 * The i and j arguments are not used.
 *
 * The field positions are the same for big- and little-endian builds of
 * this format, so a single code path serves both.
 */
void
util_format_b5g5r5a1_uint_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t texel = *(const uint16_t *)src;
   unsigned *rgba = in_dst;

   rgba[0] = (unsigned)((texel >> 10) & 0x1f); /* r */
   rgba[1] = (unsigned)((texel >> 5) & 0x1f);  /* g */
   rgba[2] = (unsigned)(texel & 0x1f);         /* b */
   rgba[3] = (unsigned)(texel >> 15);          /* a */
}

/**
 * Pack rows of signed-integer RGBA pixels into 16-bit b5g5r5a1_uint texels.
 *
 * Every source pixel is four ints (r, g, b, a).  Red, green and blue are
 * passed through CLAMP(c, 0, 31) and alpha through CLAMP(c, 0, 1); the
 * results land in bits 0-4 (b), 5-9 (g), 10-14 (r) and 15 (a) of the texel,
 * which is stored as a host-order uint16_t.
 *
 * dst_stride and src_stride are the byte distances between consecutive
 * rows of the destination and source respectively.
 *
 * The field positions are the same for big- and little-endian builds of
 * this format, so a single code path serves both.
 */
void
util_format_b5g5r5a1_uint_pack_signed(uint8_t *restrict dst_row, unsigned dst_stride, const int *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const int *rgba = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint16_t r = ((uint16_t)CLAMP(rgba[0], 0, 31)) & 0x1f;
         const uint16_t g = ((uint16_t)CLAMP(rgba[1], 0, 31)) & 0x1f;
         const uint16_t b = ((uint16_t)CLAMP(rgba[2], 0, 31)) & 0x1f;
         const uint16_t a = (uint16_t)CLAMP(rgba[3], 0, 1);

         /* Assemble in 32 bits, then narrow to the 16-bit texel. */
         *(uint16_t *)out = (uint16_t)((uint32_t)b |
                                       ((uint32_t)g << 5) |
                                       ((uint32_t)r << 10) |
                                       ((uint32_t)a << 15));

         rgba += 4;
         out += 2;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a run of r8g8b8x8_snorm texels into float RGBA quadruples.
 *
 * For each of the width texels, four bytes are read from src as a host-order
 * uint32_t.  Each colour channel is an 8-bit signed field: it is moved to
 * the top byte with a left shift, sign-extended with a right shift by 24,
 * and scaled by 1/127.  The fourth byte is ignored and alpha is written
 * as 1.  Four floats are written to dst_row per texel, in r, g, b, a order.
 *
 * Only the left-shift amounts depend on UTIL_ARCH_BIG_ENDIAN: red is the
 * most significant byte of the word on big-endian builds and the least
 * significant byte otherwise.
 */
void
util_format_r8g8b8x8_snorm_unpack_rgba_float(void *restrict dst_row, const uint8_t *restrict src, unsigned width)
{
   /* Left shift that brings each channel's byte into bits 24-31. */
#if UTIL_ARCH_BIG_ENDIAN
   enum { r_shl = 0, g_shl = 8, b_shl = 16 };
#else
   enum { r_shl = 24, g_shl = 16, b_shl = 8 };
#endif
   float *rgba = dst_row;

   for (unsigned remaining = width; remaining != 0; --remaining) {
      const uint32_t texel = *(const uint32_t *)src;
      const int32_t r = ((int32_t)(texel << r_shl)) >> 24;
      const int32_t g = ((int32_t)(texel << g_shl)) >> 24;
      const int32_t b = ((int32_t)(texel << b_shl)) >> 24;

      rgba[0] = (float)(r * (1.0f/0x7f)); /* r */
      rgba[1] = (float)(g * (1.0f/0x7f)); /* g */
      rgba[2] = (float)(b * (1.0f/0x7f)); /* b */
      rgba[3] = 1;                        /* a */

      src += 4;
      rgba += 4;
   }
}

void
util_format_r8g8b8x8_snorm_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride, const float *restrict src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         uint32_t value = 0;
         value |= (uint32_t)((uint32_t)((int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f)) << 24) ;
         value |= (uint32_t)((uint32_t)(((int8_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7f)) & 0xff) << 16) ;
         value |= (uint32_t)((uint32_t)(((int8_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7f)) & 0xff) << 8) ;
         *(uint32_t *)dst = value;
#else
         uint32_t value = 0;
         value |= (uint32_t)(((int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f)) & 0xff) ;
         value |= (uint