Move vertex format to utils.

Improve enum declaration.
Merge vertex and param formats.
2023-01-07 06:30:08 +01:00 · 2023-01-06 18:42:58 +01:00 · 2023-01-06 18:40:58 +01:00 · 2023-01-06 18:16:10 +01:00 · 2023-01-06 18:15:19 +01:00 · 2023-01-06 17:05:09 +01:00
9 changed files with 816 additions and 458 deletions
--- a/2
+++ b/2
@ -21,7 +21,7 @@ cpp/math.cpp

 OUTPUTFILE = engine.so

-CXXFLAGS = -fpic -Wall -Werror -O2 -flto -fomit-frame-pointer -ffast-math -funroll-loops -fno-rtti -fno-exceptions
+CXXFLAGS = -std=c++17 -Wall -Werror -O2 -msse2 -fpic -flto -fno-rtti -fno-exceptions

 .PHONY: all
 all: clean $(OUTPUTFILE)
--- a/init.py
+++ b/init.py
@ -44,6 +44,10 @@ def _ushort_addr(x):
    assert x.typecode == 'H'
    return x.buffer_info()[0]

+def _uint_addr(x):
+    assert x.typecode == 'I'
+    return x.buffer_info()[0]
+
 def _float_addr(x):
    assert x.typecode == 'f'
    return x.buffer_info()[0]
@ -264,34 +268,28 @@ TEXTURE_FLAG_MAG_NEAREST = 0
 TEXTURE_FLAG_MAG_LINEAR = _flag(3)

 VERTEX_FORMAT_VEC3_FLOAT = 1
-VERTEX_FORMAT_VEC3_INT10 = 2
-VERTEX_FORMAT_VEC3_UINT10 = 3
+VERTEX_FORMAT_VEC3_SHORT = 2
+VERTEX_FORMAT_VEC3_INT10 = 3
+VERTEX_FORMAT_VEC3_UINT10 = 4
+VERTEX_FORMAT_MAT3_FLOAT = 5
+VERTEX_FORMAT_MAT3_INT10 = 6
 VERTEX_FORMAT_NORMALIZE = _flag(7)
+_VERTEX_FORMAT_MASK = VERTEX_FORMAT_NORMALIZE - 1

 def vertex_format(*format):
    return array('B', format).tobytes()

-PARAM_FORMAT_VEC3_FLOAT = 1
-PARAM_FORMAT_VEC3_SHORT = 2
-PARAM_FORMAT_VEC3_INT10 = 3
-PARAM_FORMAT_MAT3_FLOAT = 4
-PARAM_FORMAT_MAT3_INT10 = 5
-PARAM_FORMAT_NORMALIZE = _flag(7)
-_PARAM_FORMAT_MASK = PARAM_FORMAT_NORMALIZE - 1
-
-_PARAMS_TYPES = (
+_VERTEX_TYPES = (
    None,
-    vec3, # PARAM_FORMAT_VEC3_FLOAT
-    vec3, # PARAM_FORMAT_VEC3_SHORT
-    vec3, # PARAM_FORMAT_VEC3_INT10
-    mat3, # PARAM_FORMAT_MAT3_FLOAT
-    mat3) # PARAM_FORMAT_MAT3_INT10
+    vec3, # VERTEX_FORMAT_VEC3_FLOAT
+    vec3, # VERTEX_FORMAT_VEC3_SHORT
+    vec3, # VERTEX_FORMAT_VEC3_INT10
+    vec3, # VERTEX_FORMAT_VEC3_UINT10
+    mat3, # VERTEX_FORMAT_MAT3_FLOAT
+    mat3) # VERTEX_FORMAT_MAT3_INT10

-def param_type(format):
-    return _PARAMS_TYPES[format & _PARAM_FORMAT_MASK]
-
-def params_format(*format):
-    return array('B', format).tobytes()
+def vertex_type(format):
+    return _VERTEX_TYPES[format & _VERTEX_FORMAT_MASK]

 INSTANCE_FLAG_SPAWNED = _flag(0)
 INSTANCE_FLAG_VISIBLE = _flag(1)
@ -371,10 +369,14 @@ _create_vertices.argtypes = (
    ctypes.c_uint, # nvertices
    ctypes.c_void_p, # vertices
    ctypes.c_uint, # nindices
-    ctypes.c_void_p) # indices
+    ctypes.c_void_p, # indices
+    ctypes.c_uint, # nmeshes
+    ctypes.c_void_p) # meshes

-def create_vertices(format, nvertices, vertices, indices):
-    return _create_vertices(format, nvertices, _ubyte_addr(vertices), len(indices), _ushort_addr(indices))
+def create_vertices(format, nvertices, vertices, indices, meshes):
+    assert len(meshes) % 2 == 0
+    return _create_vertices(format,
+        nvertices, _ubyte_addr(vertices), len(indices), _ushort_addr(indices), len(meshes) // 2, _uint_addr(meshes))

 create_batch = _engine.rk_create_batch
 create_batch.restype = _handle
@ -382,9 +384,17 @@ create_batch.errcheck = _check_handle
 create_batch.argtypes = (
    ctypes.c_void_p, # vertices
    ctypes.c_uint, # max_size
-    ctypes.c_uint, # max_meshes
    ctypes.c_char_p) # params_format

+fill_batch = _engine.rk_fill_batch
+fill_batch.restype = None
+fill_batch.argtypes = (
+    ctypes.c_void_p, # batch
+    ctypes.c_uint, # count
+    ctypes.POINTER(ctypes.c_ubyte), # flags
+    ctypes.POINTER(ctypes.c_ushort), # meshes
+    ctypes.POINTER(ctypes.c_void_p)) # params
+
 clear_buffer = _engine.rk_clear_buffer
 clear_buffer.restype = None
 clear_buffer.argtypes = (
@ -444,15 +454,6 @@ draw_triangles.restype = None
 draw_triangles.argtypes = (
    ctypes.c_void_p,) # triangles

-fill_batch = _engine.rk_fill_batch
-fill_batch.restype = None
-fill_batch.argtypes = (
-    ctypes.c_void_p, # batch
-    ctypes.c_uint, # count
-    ctypes.POINTER(ctypes.c_ubyte), # flags
-    ctypes.POINTER(ctypes.c_uint), # meshes
-    ctypes.POINTER(ctypes.c_void_p)) # params
-
 draw_batch = _engine.rk_draw_batch
 draw_batch.restype = None
 draw_batch.argtypes = (
--- a/cpp/math.hpp
+++ b/cpp/math.hpp
@ -26,6 +26,14 @@ typedef glm::vec4 rk_vec4;
 typedef glm::mat3 rk_mat3;
 typedef glm::mat4 rk_mat4;

+#define RK_CHECK_MATH_TYPE(_t, _e, _c) static_assert(sizeof(_t) == sizeof(_e) * (_c))
+
+RK_CHECK_MATH_TYPE(rk_vec2, float, 2);
+RK_CHECK_MATH_TYPE(rk_vec3, float, 3);
+RK_CHECK_MATH_TYPE(rk_vec4, float, 4);
+RK_CHECK_MATH_TYPE(rk_mat3, rk_vec3, 3);
+RK_CHECK_MATH_TYPE(rk_mat4, rk_vec4, 4);
+
 #define vec3_right   (rk_vec3(1.f, 0.f, 0.f))
 #define vec3_forward (rk_vec3(0.f, 1.f, 0.f))
 #define vec3_up      (rk_vec3(0.f, 0.f, 1.f))
--- a/cpp/render.hpp
+++ b/cpp/render.hpp
@ -27,14 +27,18 @@ typedef rk_handle_t rk_triangles_t;
 typedef rk_handle_t rk_vertices_t;
 typedef rk_handle_t rk_batch_t;

-enum rk_texture_format : rk_uint {
+typedef rk_uint rk_texture_format;
+
+enum : rk_texture_format {
    RK_TEXTURE_FORMAT_SRGB8_A8 = 0,
    RK_TEXTURE_FORMAT_RGBA8 = 1,
    RK_TEXTURE_FORMAT_RGB10_A2 = 2,
    RK_TEXTURE_FORMAT_FLOAT_32 = 3
 };

-enum rk_texture_flags : rk_uint {
+typedef rk_uint rk_texture_flags;
+
+enum : rk_texture_flags {
    RK_TEXTURE_FLAG_3D = RK_FLAG(0),
    RK_TEXTURE_FLAG_MIPMAPS = RK_FLAG(1),
    RK_TEXTURE_FLAG_MIN_NEAREST = 0,
@ -43,41 +47,40 @@ enum rk_texture_flags : rk_uint {
    RK_TEXTURE_FLAG_MAG_LINEAR = RK_FLAG(3),
 };

-enum rk_vertex_format : rk_ubyte {
+typedef rk_ubyte rk_vertex_format;
+
+enum : rk_vertex_format {
    RK_VERTEX_FORMAT_VEC3_FLOAT = 1,
-    RK_VERTEX_FORMAT_VEC3_INT10 = 2,
-    RK_VERTEX_FORMAT_VEC3_UINT10 = 3
+    RK_VERTEX_FORMAT_VEC3_SHORT = 2,
+    RK_VERTEX_FORMAT_VEC3_INT10 = 3,
+    RK_VERTEX_FORMAT_VEC3_UINT10 = 4,
+    RK_VERTEX_FORMAT_MAT3_FLOAT = 5,
+    RK_VERTEX_FORMAT_MAT3_INT10 = 6,
+    RK_VERTEX_FORMAT_NORMALIZE = RK_FLAG(7),
+    RK_VERTEX_FORMAT_MASK = RK_VERTEX_FORMAT_NORMALIZE - 1
 };

-enum : rk_ubyte { RK_VERTEX_FORMAT_NORMALIZE = RK_FLAG(7) };
-enum : rk_ubyte { RK_VERTEX_FORMAT_MASK = RK_VERTEX_FORMAT_NORMALIZE - 1 };
+typedef rk_ubyte rk_instance_flags;

-enum rk_param_format : rk_ubyte {
-    RK_PARAM_FORMAT_VEC3_FLOAT = 1,
-    RK_PARAM_FORMAT_VEC3_SHORT = 2,
-    RK_PARAM_FORMAT_VEC3_INT10 = 3,
-    RK_PARAM_FORMAT_MAT3_FLOAT = 4,
-    RK_PARAM_FORMAT_MAT3_INT10 = 5
-};
-
-enum : rk_ubyte { RK_PARAM_FORMAT_NORMALIZE = RK_FLAG(7) };
-enum : rk_ubyte { RK_PARAM_FORMAT_MASK = RK_PARAM_FORMAT_NORMALIZE - 1 };
-
-enum rk_instance_flags : rk_ubyte {
+enum : rk_instance_flags {
    RK_INSTANCE_FLAG_SPAWNED = RK_FLAG(0),
-    RK_INSTANCE_FLAG_VISIBLE = RK_FLAG(1)
+    RK_INSTANCE_FLAG_VISIBLE = RK_FLAG(1),
+    RK_INSTANCE_FLAGS_SPAWNED_VISIBLE = RK_INSTANCE_FLAG_SPAWNED | RK_INSTANCE_FLAG_VISIBLE
 };

-enum : rk_ubyte { RK_INSTANCE_FLAGS_SPAWNED_VISIBLE = RK_INSTANCE_FLAG_SPAWNED | RK_INSTANCE_FLAG_VISIBLE };
+enum : rk_uint {
+    RK_BATCH_MAX_SIZE = 65536
+};

-enum : rk_uint { RK_BATCH_MAX_SIZE = 65536 };
+typedef rk_ushort rk_vertex_index;
+typedef rk_ushort rk_mesh_index;
+typedef rk_ushort rk_instance_index;
+typedef rk_uint rk_vertex_input;
+typedef rk_uint rk_vertex_output;

-union rk_mesh {
-    rk_uint packed;
-    struct {
-        rk_ushort base_index;
-        rk_ushort ntriangles;
-    };
+struct rk_mesh {
+    rk_uint base_index;
+    rk_uint ntriangles;
 };

 RK_EXPORT void rk_render_initialize(
@ -116,13 +119,21 @@ RK_EXPORT rk_vertices_t rk_create_vertices(
    rk_uint nvertices,
    rk_ubyte const * vertices,
    rk_uint nindices,
-    rk_ushort const * indices);
+    rk_vertex_index const * indices,
+    rk_uint nmeshes,
+    rk_mesh const * meshes);

 RK_EXPORT rk_batch_t rk_create_batch(
    rk_vertices_t vertices,
    rk_uint max_size,
-    rk_uint max_meshes,
-    rk_param_format const * params_format);
+    rk_vertex_format const * params_format);
+
+RK_EXPORT void rk_fill_batch(
+    rk_batch_t batch,
+    rk_uint count,
+    rk_instance_flags const * flags,
+    rk_mesh_index const * meshes,
+    rk_vertex_input const * const * params);

 RK_EXPORT void rk_clear_buffer(
    rk_bool pixels,
@ -163,13 +174,6 @@ RK_EXPORT void rk_select_texture(
 RK_EXPORT void rk_draw_triangles(
    rk_triangles_t triangles);

-RK_EXPORT void rk_fill_batch(
-    rk_batch_t batch,
-    rk_uint count,
-    rk_instance_flags const * flags,
-    rk_mesh const * meshes,
-    rk_ubyte const * const * params);
-
 RK_EXPORT void rk_draw_batch(
    rk_batch_t batch);

--- a/cpp/render/render_opengles.cpp
+++ b/cpp/render/render_opengles.cpp
@ -13,9 +13,11 @@
 // You should have received a copy of the GNU Affero General Public License
 // along with this program.  If not, see <http://www.gnu.org/licenses/>.

-#include "../render.hpp"
 #include "render_opengles.hpp"
 #include "../display/display_glx.hpp"
+#include "../utils/vertex_format.hpp"
+#include "../utils/cmp_memcpy.hpp"
+#include <cstdlib>
 #include <cstdio>
 #include <cstring>

@ -25,6 +27,9 @@ typedef void (*rk_MultiDrawElementsIndirectFunc)(rk_uint, rk_uint, const void *,
 static rk_DrawElementsInstancedBaseInstanceFunc rk_DrawElementsInstancedBaseInstance = nullptr;
 static rk_MultiDrawElementsIndirectFunc rk_MultiDrawElementsIndirect = nullptr;

+static unsigned rk_nbuckets = 0;
+static rk_bucket * rk_buckets = nullptr;
+
 static void rk_gl_printf(char const * message) {
    printf("[GL] %s\n", message);
 }
@ -314,7 +319,9 @@ rk_vertices_t rk_create_vertices(
    rk_uint nvertices,
    rk_ubyte const * _vertices,
    rk_uint nindices,
-    rk_ushort const * indices) {
+    rk_vertex_index const * indices,
+    rk_uint nmeshes,
+    rk_mesh const * meshes) {
    if (!format || !nvertices || !_vertices || !nindices || !indices) {
        rk_printf("rk_create_vertices(): invalid params.");
        return RK_INVALID_HANDLE;
@ -324,13 +331,13 @@ rk_vertices_t rk_create_vertices(
    for (rk_vertex_format const * f = format; *f; ++f, ++format_size) {
        switch (*f & RK_VERTEX_FORMAT_MASK) {
            case RK_VERTEX_FORMAT_VEC3_FLOAT:
-                vertex_size += sizeof(rk_vec3_float);
+                vertex_size += rk_vec3_float::get_output_size();
                break;
            case RK_VERTEX_FORMAT_VEC3_INT10:
-                vertex_size += sizeof(rk_vec3_int10);
+                vertex_size += rk_vec3_int10::get_output_size();
                break;
            case RK_VERTEX_FORMAT_VEC3_UINT10:
-                vertex_size += sizeof(rk_vec3_uint10);
+                vertex_size += rk_vec3_uint10::get_output_size();
                break;
            default:
                rk_printf("rk_create_vertices(): invalid vertex format.");
@ -345,152 +352,80 @@ rk_vertices_t rk_create_vertices(
    rk_vertices * const vertices = new rk_vertices;
    vertices->nvertices = nvertices;
    vertices->nindices = nindices;
+    vertices->nmeshes = nmeshes;
    vertices->format = new rk_vertex_format[format_size + 1];
    memcpy(vertices->format, format, (format_size + 1) * sizeof(rk_vertex_format));
    vertices->vertices = new rk_ubyte[nvertices * vertex_size];
    memcpy(vertices->vertices, _vertices, nvertices * vertex_size);
-    vertices->indices = new rk_ushort[nindices];
-    memcpy(vertices->indices, indices, nindices * sizeof(rk_ushort));
+    vertices->indices = new rk_vertex_index[nindices];
+    memcpy(vertices->indices, indices, nindices * sizeof(rk_vertex_index));
+    vertices->meshes = new rk_mesh[nmeshes];
+    memcpy(vertices->meshes, meshes, nmeshes * sizeof(rk_mesh));
+    vertices->vertices_buffer = 0;
+    vertices->indices_buffer = 0;
    return reinterpret_cast<rk_vertices_t>(vertices);
 }

-static void rk_pack_vec3_float(
-    unsigned const count,
-    rk_ushort const * const __restrict indices,
-    rk_ubyte * __restrict _dst,
-    rk_ubyte const * const __restrict _src) {
-    rk_ushort const * const last_index = indices + count;
-    rk_vec3_float * __restrict dst = reinterpret_cast<rk_vec3_float *>(_dst);
-    rk_vec3_float const * const __restrict src = reinterpret_cast<rk_vec3_float const *>(_src);
-    for (rk_ushort const * __restrict index = indices; index < last_index; ++index, ++dst) {
-        *dst = src[*index];
+static void rk_buckets_alloc(
+    rk_batch const & batch) {
+    unsigned const count = batch.vertices->nmeshes;
+    unsigned const size = batch.max_size;
+    bool reallocated = false;
+    if (!rk_nbuckets) {
+        rk_nbuckets = count;
+        rk_buckets = reinterpret_cast<rk_bucket *>(malloc(count * sizeof(rk_bucket)));
+        for (unsigned index = 0; index < count; ++index) {
+            rk_bucket & bucket = rk_buckets[index];
+            bucket.size = size;
+            bucket.indices = reinterpret_cast<rk_instance_index *>(malloc(size * sizeof(rk_instance_index)));
+        }
+        reallocated = true;
    }
-}
-
-static void rk_pack_vec3_short(
-    unsigned const count,
-    rk_ushort const * const __restrict indices,
-    rk_ubyte * __restrict _dst,
-    rk_ubyte const * const __restrict _src) {
-    rk_ushort const * const last_index = indices + count;
-    rk_vec3_short * __restrict dst = reinterpret_cast<rk_vec3_short *>(_dst);
-    rk_vec3_float const * const __restrict src = reinterpret_cast<rk_vec3_float const *>(_src);
-    for (rk_ushort const * __restrict index = indices; index < last_index; ++index, ++dst) {
-        rk_vec3_float const & input = src[*index];
-        dst->x = static_cast<rk_short>(input.x);
-        dst->y = static_cast<rk_short>(input.y);
-        dst->z = static_cast<rk_short>(input.z);
-        dst->pad = 0;
+    else if (count <= rk_nbuckets) {
+        for (unsigned index = 0; index < count; ++index) {
+            rk_bucket & bucket = rk_buckets[index];
+            if (bucket.size < size) {
+                bucket.size = size;
+                bucket.indices = reinterpret_cast<rk_instance_index *>(
+                    realloc(bucket.indices, size * sizeof(rk_instance_index)));
+                reallocated = true;
+            }
+        }
    }
-}
-
-static void rk_pack_vec3_short_norm(
-    unsigned const count,
-    rk_ushort const * const __restrict indices,
-    rk_ubyte * __restrict _dst,
-    rk_ubyte const * const __restrict _src) {
-    rk_ushort const * const last_index = indices + count;
-    rk_vec3_short * __restrict dst = reinterpret_cast<rk_vec3_short *>(_dst);
-    rk_vec3_float const * const __restrict src = reinterpret_cast<rk_vec3_float const *>(_src);
-    #define _convert(s) (static_cast<rk_short>((s) * ((s) < 0.f ? 32768.f : 32767.f)))
-    for (rk_ushort const * __restrict index = indices; index < last_index; ++index, ++dst) {
-        rk_vec3_float const & input = src[*index];
-        dst->x = _convert(input.x);
-        dst->y = _convert(input.y);
-        dst->z = _convert(input.z);
-        dst->pad = 0;
+    else {
+        rk_buckets = reinterpret_cast<rk_bucket *>(realloc(rk_buckets, count * sizeof(rk_bucket)));
+        for (unsigned index = 0; index < rk_nbuckets; ++index) {
+            rk_bucket & bucket = rk_buckets[index];
+            if (bucket.size < size) {
+                bucket.size = size;
+                bucket.indices = reinterpret_cast<rk_instance_index *>(
+                    realloc(bucket.indices, size * sizeof(rk_instance_index)));
+            }
+        }
+        for (unsigned index = rk_nbuckets; index < count; ++index) {
+            rk_bucket & bucket = rk_buckets[index];
+            bucket.size = size;
+            bucket.indices = reinterpret_cast<rk_instance_index *>(
+                malloc(size * sizeof(rk_instance_index)));
+        }
+        rk_nbuckets = count;
+        reallocated = true;
    }
-    #undef _convert
-}
-
-static void rk_pack_vec3_int10(
-    unsigned const count,
-    rk_ushort const * const __restrict indices,
-    rk_ubyte * __restrict _dst,
-    rk_ubyte const * const __restrict _src) {
-    rk_ushort const * const last_index = indices + count;
-    rk_vec3_int10 * __restrict dst = reinterpret_cast<rk_vec3_int10 *>(_dst);
-    rk_vec3_float const * const __restrict src = reinterpret_cast<rk_vec3_float const *>(_src);
-    #define _convert(s) (static_cast<rk_int>((s)) & 1023)
-    for (rk_ushort const * __restrict index = indices; index < last_index; ++index, ++dst) {
-        rk_vec3_float const & input = src[*index];
-        *dst = _convert(input.x) | (_convert(input.y) << 10) | (_convert(input.z) << 20);
+    if (reallocated) {
+        unsigned total_size = rk_nbuckets * sizeof(rk_bucket);
+        for (unsigned index = 0; index < rk_nbuckets; ++index) {
+            rk_bucket const & bucket = rk_buckets[index];
+            total_size += bucket.size * sizeof(rk_instance_index);
+        }
+        printf("[RK] rk_buckets_alloc() -> %d KiB\n", total_size / 1024);
    }
-    #undef _convert
-}
-
-static void rk_pack_vec3_int10_norm(
-    unsigned const count,
-    rk_ushort const * const __restrict indices,
-    rk_ubyte * __restrict _dst,
-    rk_ubyte const * const __restrict _src) {
-    rk_ushort const * const last_index = indices + count;
-    rk_vec3_int10 * __restrict dst = reinterpret_cast<rk_vec3_int10 *>(_dst);
-    rk_vec3_float const * const __restrict src = reinterpret_cast<rk_vec3_float const *>(_src);
-    #define _convert(s) (static_cast<rk_int>((s) * ((s) < 0.f ? 512.f : 511.f)) & 1023)
-    for (rk_ushort const * __restrict index = indices; index < last_index; ++index, ++dst) {
-        rk_vec3_float const & input = src[*index];
-        *dst = _convert(input.x) | (_convert(input.y) << 10) | (_convert(input.z) << 20);
-    }
-    #undef _convert
-}
-
-static void rk_pack_mat3_float(
-    unsigned const count,
-    rk_ushort const * const __restrict indices,
-    rk_ubyte * __restrict _dst,
-    rk_ubyte const * const __restrict _src) {
-    rk_ushort const * const last_index = indices + count;
-    rk_mat3_float * __restrict dst = reinterpret_cast<rk_mat3_float *>(_dst);
-    rk_mat3_float const * const __restrict src = reinterpret_cast<rk_mat3_float const *>(_src);
-    for (rk_ushort const * __restrict index = indices; index < last_index; ++index, ++dst) {
-        *dst = src[*index];
-    }
-    #undef _convert
-}
-
-static void rk_pack_mat3_int10(
-    unsigned const count,
-    rk_ushort const * const __restrict indices,
-    rk_ubyte * __restrict _dst,
-    rk_ubyte const * const __restrict _src) {
-    rk_ushort const * const last_index = indices + count;
-    rk_mat3_int10 * __restrict dst = reinterpret_cast<rk_mat3_int10 *>(_dst);
-    rk_mat3_float const * const __restrict src = reinterpret_cast<rk_mat3_float const *>(_src);
-    #define _convert(s) (static_cast<rk_int>((s)) & 1023)
-    for (rk_ushort const * __restrict index = indices; index < last_index; ++index, ++dst) {
-        rk_mat3_float const & input = src[*index];
-        dst->x = _convert(input.x.x) | (_convert(input.x.y) << 10) | (_convert(input.x.z) << 20);
-        dst->y = _convert(input.y.x) | (_convert(input.y.y) << 10) | (_convert(input.y.z) << 20);
-        dst->z = _convert(input.z.x) | (_convert(input.z.y) << 10) | (_convert(input.z.z) << 20);
-    }
-    #undef _convert
-}
-
-static void rk_pack_mat3_int10_norm(
-    unsigned const count,
-    rk_ushort const * const __restrict indices,
-    rk_ubyte * __restrict _dst,
-    rk_ubyte const * const __restrict _src) {
-    rk_ushort const * const last_index = indices + count;
-    rk_mat3_int10 * __restrict dst = reinterpret_cast<rk_mat3_int10 *>(_dst);
-    rk_mat3_float const * const __restrict src = reinterpret_cast<rk_mat3_float const *>(_src);
-    #define _convert(s) (static_cast<rk_int>((s) * ((s) < 0.f ? 512.f : 511.f)) & 1023)
-    for (rk_ushort const * __restrict index = indices; index < last_index; ++index, ++dst) {
-        rk_mat3_float const & input = src[*index];
-        dst->x = _convert(input.x.x) | (_convert(input.x.y) << 10) | (_convert(input.x.z) << 20);
-        dst->y = _convert(input.y.x) | (_convert(input.y.y) << 10) | (_convert(input.y.z) << 20);
-        dst->z = _convert(input.z.x) | (_convert(input.z.y) << 10) | (_convert(input.z.z) << 20);
-    }
-    #undef _convert
 }

 rk_batch_t rk_create_batch(
    rk_vertices_t _vertices,
    rk_uint max_size,
-    rk_uint max_meshes,
-    rk_param_format const * params_format) {
-    rk_vertices const * const vertices = reinterpret_cast<rk_vertices const *>(_vertices);
+    rk_vertex_format const * params_format) {
+    rk_vertices * const vertices = reinterpret_cast<rk_vertices *>(_vertices);
    if (!vertices || !max_size || max_size > RK_BATCH_MAX_SIZE) {
        rk_printf("rk_create_batch(): invalid params.");
        return RK_INVALID_HANDLE;
@ -499,35 +434,35 @@ rk_batch_t rk_create_batch(
    for (rk_vertex_format const * f = vertices->format; *f; ++f) {
        switch (*f & RK_VERTEX_FORMAT_MASK) {
            case RK_VERTEX_FORMAT_VEC3_FLOAT:
-                vertex_size += sizeof(rk_vec3_float);
+                vertex_size += rk_vec3_float::get_output_size();
                break;
            case RK_VERTEX_FORMAT_VEC3_INT10:
-                vertex_size += sizeof(rk_vec3_int10);
+                vertex_size += rk_vec3_int10::get_output_size();
                break;
            case RK_VERTEX_FORMAT_VEC3_UINT10:
-                vertex_size += sizeof(rk_vec3_uint10);
+                vertex_size += rk_vec3_uint10::get_output_size();
                break;
        }
    }
    unsigned nparams = 0;
    unsigned params_size = 0;
    if (params_format) {
-        for (rk_param_format const * f = params_format; *f; ++f, ++nparams) {
-            switch (*f & RK_PARAM_FORMAT_MASK) {
-                case RK_PARAM_FORMAT_VEC3_FLOAT:
-                    params_size += sizeof(rk_vec3_float);
+        for (rk_vertex_format const * f = params_format; *f; ++f, ++nparams) {
+            switch (*f & RK_VERTEX_FORMAT_MASK) {
+                case RK_VERTEX_FORMAT_VEC3_FLOAT:
+                    params_size += rk_vec3_float::get_output_size();
                    break;
-                case RK_PARAM_FORMAT_VEC3_SHORT:
-                    params_size += sizeof(rk_vec3_short);
+                case RK_VERTEX_FORMAT_VEC3_SHORT:
+                    params_size += rk_vec3_short::get_output_size();
                    break;
-                case RK_PARAM_FORMAT_VEC3_INT10:
-                    params_size += sizeof(rk_vec3_int10);
+                case RK_VERTEX_FORMAT_VEC3_INT10:
+                    params_size += rk_vec3_int10::get_output_size();
                    break;
-                case RK_PARAM_FORMAT_MAT3_FLOAT:
-                    params_size += sizeof(rk_mat3_float);
+                case RK_VERTEX_FORMAT_MAT3_FLOAT:
+                    params_size += rk_mat3_float::get_output_size();
                    break;
-                case RK_PARAM_FORMAT_MAT3_INT10:
-                    params_size += sizeof(rk_mat3_int10);
+                case RK_VERTEX_FORMAT_MAT3_INT10:
+                    params_size += rk_mat3_int10::get_output_size();
                    break;
                default:
                    rk_printf("rk_create_batch(): invalid param format.");
@ -542,12 +477,16 @@ rk_batch_t rk_create_batch(
    batch->ncommands = 0;
    batch->ninstances = 0;
    batch->max_size = max_size;
-    batch->max_meshes = max_meshes;
    batch->nparams = nparams;
+    batch->vertices = vertices;
    batch->flags = new rk_instance_flags[max_size];
-    batch->meshes = new rk_mesh[max_size];
-    batch->indices = new rk_ushort[max_size];
-    batch->commands = new rk_command[max_meshes];
+    memset(batch->flags, 0xFF, max_size * sizeof(rk_instance_flags));
+    batch->meshes = new rk_mesh_index[max_size];
+    memset(batch->meshes, 0xFF, max_size * sizeof(rk_mesh_index));
+    batch->indices = new rk_instance_index[max_size];
+    memset(batch->indices, 0, max_size * sizeof(rk_instance_index));
+    batch->commands = new rk_command[vertices->nmeshes];
+    memset(batch->commands, 0, vertices->nmeshes * sizeof(rk_command));
    if (nparams) {
        batch->params = new rk_parameter[nparams];
    } else {
@ -555,17 +494,24 @@ rk_batch_t rk_create_batch(
    }
    glGenVertexArrays(1, &batch->vertex_array);
    glBindVertexArray(batch->vertex_array);
-    glGenBuffers(1, &batch->vertices_buffer);
-    glBindBuffer(GL_ARRAY_BUFFER, batch->vertices_buffer);
-    glBufferData(GL_ARRAY_BUFFER, vertices->nvertices * vertex_size, vertices->vertices, GL_STATIC_DRAW);
-    glBindBuffer(GL_ARRAY_BUFFER, 0);
-    glGenBuffers(1, &batch->indices_buffer);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, batch->indices_buffer);
-    glBufferData(GL_ELEMENT_ARRAY_BUFFER, vertices->nindices * sizeof(rk_ushort), vertices->indices, GL_STATIC_DRAW);
+    if (!vertices->vertices_buffer) {
+        glGenBuffers(1, &vertices->vertices_buffer);
+        glBindBuffer(GL_ARRAY_BUFFER, vertices->vertices_buffer);
+        glBufferData(GL_ARRAY_BUFFER, vertices->nvertices * vertex_size, vertices->vertices, GL_STATIC_DRAW);
+        glBindBuffer(GL_ARRAY_BUFFER, 0);
+    }
+    if (vertices->indices_buffer) {
+        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vertices->indices_buffer);
+    } else {
+        glGenBuffers(1, &vertices->indices_buffer);
+        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vertices->indices_buffer);
+        glBufferData(GL_ELEMENT_ARRAY_BUFFER,
+            vertices->nindices * sizeof(rk_vertex_index), vertices->indices, GL_STATIC_DRAW);
+    }
    if (rk_MultiDrawElementsIndirect) {
        glGenBuffers(1, &batch->commands_buffer);
        glBindBuffer(GL_DRAW_INDIRECT_BUFFER, batch->commands_buffer);
-        glBufferData(GL_DRAW_INDIRECT_BUFFER, max_meshes * sizeof(rk_command), nullptr, GL_DYNAMIC_DRAW);
+        glBufferData(GL_DRAW_INDIRECT_BUFFER, vertices->nmeshes * sizeof(rk_command), nullptr, GL_DYNAMIC_DRAW);
        glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
    }
    if (nparams) {
@ -577,7 +523,7 @@ rk_batch_t rk_create_batch(
    unsigned binding = 0;
    unsigned attrib = 0;
    unsigned offset = 0;
-    glBindVertexBuffer(binding, batch->vertices_buffer, 0, vertex_size);
+    glBindVertexBuffer(binding, vertices->vertices_buffer, 0, vertex_size);
    for (rk_vertex_format const * f = vertices->format; *f; ++f) {
        GLboolean const norm = (*f & RK_VERTEX_FORMAT_NORMALIZE) != 0;
        switch (*f & RK_VERTEX_FORMAT_MASK) {
@ -585,99 +531,234 @@ rk_batch_t rk_create_batch(
                glEnableVertexAttribArray(attrib);
                glVertexAttribFormat(attrib, 3, GL_FLOAT, GL_FALSE, offset);
                glVertexAttribBinding(attrib++, binding);
-                offset += sizeof(rk_vec3_float);
+                offset += rk_vec3_float::get_output_size();
                break;
            case RK_VERTEX_FORMAT_VEC3_INT10:
                glEnableVertexAttribArray(attrib);
                glVertexAttribFormat(attrib, 4, GL_INT_2_10_10_10_REV, norm, offset);
                glVertexAttribBinding(attrib++, binding);
-                offset += sizeof(rk_vec3_int10);
+                offset += rk_vec3_int10::get_output_size();
                break;
            case RK_VERTEX_FORMAT_VEC3_UINT10:
                glEnableVertexAttribArray(attrib);
                glVertexAttribFormat(attrib, 4, GL_UNSIGNED_INT_2_10_10_10_REV, norm, offset);
                glVertexAttribBinding(attrib++, binding);
-                offset += sizeof(rk_vec3_uint10);
+                offset += rk_vec3_uint10::get_output_size();
                break;
        }
    }
-    ++binding;
-    offset = 0;
+    binding += 1;
    if (nparams) {
+        offset = 0;
        rk_parameter * param = batch->params;
-        for (rk_param_format const * f = params_format; *f; ++f, ++param, ++binding) {
-            GLboolean const norm = (*f & RK_PARAM_FORMAT_NORMALIZE) != 0;
+        for (rk_vertex_format const * f = params_format; *f; ++f, ++param, ++binding) {
+            GLboolean const norm = (*f & RK_VERTEX_FORMAT_NORMALIZE) != 0;
            param->dirty = false;
            param->binding = binding;
            param->offset = offset;
-            switch (*f & RK_PARAM_FORMAT_MASK) {
-                case RK_PARAM_FORMAT_VEC3_FLOAT:
-                    param->src_size = sizeof(rk_vec3);
-                    param->dst_size = sizeof(rk_vec3_float);
-                    param->packer = rk_pack_vec3_float;
+            switch (*f & RK_VERTEX_FORMAT_MASK) {
+                case RK_VERTEX_FORMAT_VEC3_FLOAT:
+                    param->src_size = rk_vec3_float::get_input_size();
+                    param->dst_size = rk_vec3_float::get_output_size();
+                    param->packer = rk_vec3_float::param_packer;
                    glBindVertexBuffer(binding, batch->params_buffer, param->offset, param->dst_size);
                    glEnableVertexAttribArray(attrib);
                    glVertexAttribFormat(attrib, 3, GL_FLOAT, GL_FALSE, 0);
                    glVertexAttribBinding(attrib++, binding);
                    break;
-                case RK_PARAM_FORMAT_VEC3_SHORT:
-                    param->src_size = sizeof(rk_vec3);
-                    param->dst_size = sizeof(rk_vec3_short);
-                    param->packer = norm ? rk_pack_vec3_short_norm : rk_pack_vec3_short;
+                case RK_VERTEX_FORMAT_VEC3_SHORT:
+                    param->src_size = rk_vec3_short::get_input_size();
+                    param->dst_size = rk_vec3_short::get_output_size();
+                    param->packer = norm ? rk_vec3_short_norm::param_packer : rk_vec3_short::param_packer;
                    glBindVertexBuffer(binding, batch->params_buffer, param->offset, param->dst_size);
                    glEnableVertexAttribArray(attrib);
                    glVertexAttribFormat(attrib, 3, GL_SHORT, norm, 0);
                    glVertexAttribBinding(attrib++, binding);
                    break;
-                case RK_PARAM_FORMAT_VEC3_INT10:
-                    param->src_size = sizeof(rk_vec3);
-                    param->dst_size = sizeof(rk_vec3_int10);
-                    param->packer = norm ? rk_pack_vec3_int10_norm : rk_pack_vec3_int10;
+                case RK_VERTEX_FORMAT_VEC3_INT10:
+                    param->src_size = rk_vec3_int10::get_input_size();
+                    param->dst_size = rk_vec3_int10::get_output_size();
+                    param->packer = norm ? rk_vec3_int10_norm::param_packer : rk_vec3_int10::param_packer;
                    glBindVertexBuffer(binding, batch->params_buffer, param->offset, param->dst_size);
                    glEnableVertexAttribArray(attrib);
                    glVertexAttribFormat(attrib, 4, GL_INT_2_10_10_10_REV, norm, 0);
                    glVertexAttribBinding(attrib++, binding);
                    break;
-                case RK_PARAM_FORMAT_MAT3_FLOAT:
-                    param->src_size = sizeof(rk_mat3);
-                    param->dst_size = sizeof(rk_mat3_float);
-                    param->packer = rk_pack_mat3_float;
+                case RK_VERTEX_FORMAT_MAT3_FLOAT:
+                    param->src_size = rk_mat3_float::get_input_size();
+                    param->dst_size = rk_mat3_float::get_output_size();
+                    param->packer = rk_mat3_float::param_packer;
                    glBindVertexBuffer(binding, batch->params_buffer, param->offset, param->dst_size);
                    glEnableVertexAttribArray(attrib);
-                    glVertexAttribFormat(attrib, 3, GL_FLOAT, GL_FALSE, offsetof(rk_mat3_float, x));
+                    glVertexAttribFormat(attrib, 3, GL_FLOAT, GL_FALSE, rk_mat3_float::get_output_offset(0));
                    glVertexAttribBinding(attrib++, binding);
                    glEnableVertexAttribArray(attrib);
-                    glVertexAttribFormat(attrib, 3, GL_FLOAT, GL_FALSE, offsetof(rk_mat3_float, y));
+                    glVertexAttribFormat(attrib, 3, GL_FLOAT, GL_FALSE, rk_mat3_float::get_output_offset(1));
                    glVertexAttribBinding(attrib++, binding);
                    glEnableVertexAttribArray(attrib);
-                    glVertexAttribFormat(attrib, 3, GL_FLOAT, GL_FALSE, offsetof(rk_mat3_float, z));
+                    glVertexAttribFormat(attrib, 3, GL_FLOAT, GL_FALSE, rk_mat3_float::get_output_offset(2));
                    glVertexAttribBinding(attrib++, binding);
                    break;
-                case RK_PARAM_FORMAT_MAT3_INT10:
-                    param->src_size = sizeof(rk_mat3);
-                    param->dst_size = sizeof(rk_mat3_int10);
-                    param->packer = norm ? rk_pack_mat3_int10_norm : rk_pack_mat3_int10;
+                case RK_VERTEX_FORMAT_MAT3_INT10:
+                    param->src_size = rk_mat3_int10::get_input_size();
+                    param->dst_size = rk_mat3_int10::get_output_size();
+                    param->packer = norm ? rk_mat3_int10_norm::param_packer : rk_mat3_int10::param_packer;
                    glBindVertexBuffer(binding, batch->params_buffer, param->offset, param->dst_size);
                    glEnableVertexAttribArray(attrib);
-                    glVertexAttribFormat(attrib, 4, GL_INT_2_10_10_10_REV, norm, offsetof(rk_mat3_int10, x));
+                    glVertexAttribFormat(attrib, 4, GL_INT_2_10_10_10_REV, norm, rk_mat3_int10::get_output_offset(0));
                    glVertexAttribBinding(attrib++, binding);
                    glEnableVertexAttribArray(attrib);
-                    glVertexAttribFormat(attrib, 4, GL_INT_2_10_10_10_REV, norm, offsetof(rk_mat3_int10, y));
+                    glVertexAttribFormat(attrib, 4, GL_INT_2_10_10_10_REV, norm, rk_mat3_int10::get_output_offset(1));
                    glVertexAttribBinding(attrib++, binding);
                    glEnableVertexAttribArray(attrib);
-                    glVertexAttribFormat(attrib, 4, GL_INT_2_10_10_10_REV, norm, offsetof(rk_mat3_int10, z));
+                    glVertexAttribFormat(attrib, 4, GL_INT_2_10_10_10_REV, norm, rk_mat3_int10::get_output_offset(2));
                    glVertexAttribBinding(attrib++, binding);
                    break;
            }
            glVertexBindingDivisor(binding, 1);
-            param->source = new rk_ubyte[max_size * param->src_size];
+            param->src_len = param->src_size / sizeof(rk_vertex_output);
+            param->dst_len = param->dst_size / sizeof(rk_vertex_output);
+            param->source = new rk_vertex_input[max_size * param->src_len];
+            memset(param->source, 0xFF, max_size * param->src_size);
            offset += max_size * param->dst_size;
        }
    }
    glBindVertexArray(0);
+    rk_buckets_alloc(*batch);
    return reinterpret_cast<rk_batch_t>(batch);
 }

+// Rebuilds the draw list of a batch: collects the index of every spawned and visible instance into
+// one bucket per mesh, concatenates the buckets into batch.indices and emits one rk_command per
+// non-empty bucket. Updates batch.ninstances / batch.ncommands and leaves the batch in
+// RK_BATCH_STATE_SORTED.
+//
+// Uses the file-level rk_buckets array as scratch space, one bucket per mesh of batch.vertices
+// (presumably sized by rk_buckets_alloc() -- confirm each bucket can hold batch.count indices).
+// When rk_MultiDrawElementsIndirect is available the commands are uploaded to the buffer currently
+// bound to GL_DRAW_INDIRECT_BUFFER; callers bind batch.commands_buffer before calling.
+[[RK_HOT, RK_FAST]]
+static void rk_sort_batch(
+    rk_batch const & batch) {
+    rk_bucket const * const last_bucket = rk_buckets + batch.vertices->nmeshes;
+    // Empty every bucket that this batch can address.
+    for (rk_bucket * __restrict bucket = rk_buckets; bucket < last_bucket; ++bucket) {
+        bucket->count = 0;
+    }
+    // Distribute the drawable instances into the bucket of their mesh.
+    // NOTE(review): *mesh_index is used unchecked; it is assumed to be < nmeshes -- confirm with callers.
+    rk_instance_flags const * __restrict flags = batch.flags;
+    rk_mesh_index const * __restrict mesh_index = batch.meshes;
+    for (unsigned index = 0; index < batch.count; ++index, ++flags, ++mesh_index) {
+        if ((*flags & RK_INSTANCE_FLAGS_SPAWNED_VISIBLE) == RK_INSTANCE_FLAGS_SPAWNED_VISIBLE) {
+            rk_bucket & __restrict bucket = rk_buckets[*mesh_index];
+            bucket.indices[bucket.count++] = index;
+        }
+    }
+    // Flatten the buckets, in mesh order, into the batch's index list and build the draw commands.
+    rk_instance_index * __restrict indices = batch.indices;
+    rk_command * __restrict command = batch.commands;
+    rk_mesh const * __restrict mesh = batch.vertices->meshes;
+    for (rk_bucket const * __restrict bucket = rk_buckets; bucket < last_bucket; ++bucket, ++mesh) {
+        if (bucket->count) {
+            memcpy(indices, bucket->indices, bucket->count * sizeof(rk_instance_index));
+            command->nvertices = mesh->ntriangles * 3;
+            command->ninstances = bucket->count;
+            command->base_index = mesh->base_index;
+            // Written explicitly so the uploaded command never carries an unset base vertex.
+            command->base_vertex = 0;
+            command->base_instance = indices - batch.indices;
+            indices += bucket->count;
+            ++command;
+        }
+    }
+    batch.ninstances = indices - batch.indices;
+    batch.ncommands = command - batch.commands;
+    if (rk_MultiDrawElementsIndirect) {
+        glBufferSubData(GL_DRAW_INDIRECT_BUFFER, 0, batch.ncommands * sizeof(rk_command), batch.commands);
+    }
+    batch.state = RK_BATCH_STATE_SORTED;
+}
+
+// Uploads every dirty instance parameter of a sorted batch: maps the parameter's range of
+// batch.params_buffer and lets the parameter's packer convert the source values of the
+// batch.ninstances sorted instances into it.
+//
+// A parameter stays dirty when its range could not be mapped or when glUnmapBuffer() reports that
+// the data store was lost; in that case the batch is left in RK_BATCH_STATE_SORTED so the upload is
+// attempted again on the next pass. Otherwise the batch ends in RK_BATCH_STATE_PACKED.
+[[RK_HOT, RK_FAST]]
+static void rk_pack_batch(
+    rk_batch const & batch) {
+    bool all_uploaded = true;
+    if (batch.nparams) {
+        glBindBuffer(GL_ARRAY_BUFFER, batch.params_buffer);
+        for (rk_parameter const * param = batch.params; param < batch.params + batch.nparams; ++param) {
+            if (!param->dirty) {
+                continue;
+            }
+            // With no instance to draw there is nothing to upload: the parameter is simply marked clean.
+            if (batch.ninstances) {
+                rk_vertex_output * const dst = reinterpret_cast<rk_vertex_output *>(
+                    glMapBufferRange(GL_ARRAY_BUFFER, param->offset, batch.ninstances * param->dst_size,
+                        GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_UNSYNCHRONIZED_BIT));
+                if (!dst) {
+                    all_uploaded = false;
+                    continue;
+                }
+                param->packer(batch.ninstances, batch.indices, dst, param->source);
+                // GL_FALSE means the mapped contents were corrupted and must be written again.
+                if (glUnmapBuffer(GL_ARRAY_BUFFER) == GL_FALSE) {
+                    all_uploaded = false;
+                    continue;
+                }
+            }
+            param->dirty = false;
+        }
+        glBindBuffer(GL_ARRAY_BUFFER, 0);
+    }
+    batch.state = all_uploaded ? RK_BATCH_STATE_PACKED : RK_BATCH_STATE_SORTED;
+}
+
+// Updates the per-instance data of a batch.
+//
+// _batch : handle of the batch to fill.
+// count  : number of instances; must be non-zero and at most the batch's max_size.
+// flags  : `count` instance flags, or null to keep the flags already stored.
+// meshes : `count` mesh indices, or null to keep the indices already stored.
+// params : array of batch->nparams source pointers, or null; a null entry keeps that parameter's
+//          stored values.
+//
+// Everything must be provided the first time a batch is filled and whenever it grows. Incoming
+// data is compared with the stored copy while being copied, so only actual changes trigger a new
+// sort and/or a new parameter upload. A batch filled for the first time is sorted and packed
+// immediately; later changes are deferred by lowering the batch state.
+[[RK_HOT, RK_FAST]]
+void rk_fill_batch(
+    rk_batch_t _batch,
+    rk_uint count,
+    rk_instance_flags const * flags,
+    rk_mesh_index const * meshes,
+    rk_vertex_input const * const * params) {
+    rk_batch const * const batch = reinterpret_cast<rk_batch const *>(_batch);
+    if (!batch || !count || count > batch->max_size) {
+        rk_printf("rk_fill_batch(): invalid params.");
+        return;
+    }
+    // Work out which parameter sources were provided.
+    bool got_any_params = false;
+    bool got_all_params = !batch->nparams;
+    if (batch->nparams) {
+        got_all_params = (params != nullptr);
+        if (params) {
+            for (rk_vertex_input const * const * param = params; param < params + batch->nparams; ++param) {
+                bool const got_param = (*param != nullptr);
+                got_any_params |= got_param;
+                got_all_params &= got_param;
+            }
+        }
+    }
+    bool const is_empty = (batch->state < RK_BATCH_STATE_FILLED);
+    bool const resized = (count != batch->count);
+    bool const got_everything = (flags && meshes && got_all_params);
+    if (is_empty && !got_everything) {
+        rk_printf("rk_fill_batch(): cannot freeze an empty batch.");
+        return;
+    } else if (count > batch->count && !got_everything) {
+        rk_printf("rk_fill_batch(): cannot grow a frozen batch.");
+        return;
+    }
+    batch->count = count;
+    // rk_cmp_memcpy() copies and reports whether the stored data actually changed.
+    bool const cmp_flags = (flags && rk_cmp_memcpy(batch->flags, flags, batch->count));
+    bool const cmp_meshes = (meshes && rk_cmp_memcpy(batch->meshes, meshes, batch->count));
+    bool const need_sorting = (cmp_flags || cmp_meshes || resized);
+    if (batch->nparams) {
+        rk_parameter const * const last_param = batch->params + batch->nparams;
+        if (got_any_params) {
+            rk_vertex_input const * const * src = params;
+            for (rk_parameter const * param = batch->params; param < last_param; ++param, ++src) {
+                bool const changed =
+                    (*src && rk_cmp_memcpy(param->source, *src, batch->count * param->src_len));
+                // Only ever raise the flag: an upload still pending from a previous fill must not
+                // be cancelled because this call happens to bring identical data.
+                if (changed || need_sorting) {
+                    param->dirty = true;
+                }
+            }
+        } else if (need_sorting) {
+            for (rk_parameter const * param = batch->params; param < last_param; ++param) {
+                param->dirty = true;
+            }
+        }
+    }
+    if (is_empty) {
+        // First fill: sort and upload right away.
+        glBindVertexArray(batch->vertex_array);
+        if (rk_MultiDrawElementsIndirect) {
+            glBindBuffer(GL_DRAW_INDIRECT_BUFFER, batch->commands_buffer);
+        }
+        rk_sort_batch(*batch);
+        rk_pack_batch(*batch);
+        if (rk_MultiDrawElementsIndirect) {
+            glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
+        }
+        glBindVertexArray(0);
+    } else if (need_sorting) {
+        batch->state = RK_BATCH_STATE_FILLED;
+    } else if (batch->state > RK_BATCH_STATE_SORTED) {
+        // Only lower the state: a batch still waiting to be sorted must stay in the FILLED state.
+        batch->state = RK_BATCH_STATE_SORTED;
+    }
+}
+
 void rk_clear_buffer(
    rk_bool pixels,
    rk_bool depth,
@ -773,121 +854,9 @@ RK_EXPORT void rk_draw_triangles(
    }
 }

-void rk_fill_batch(
-    rk_batch_t _batch,
-    rk_uint count,
-    rk_instance_flags const * flags,
-    rk_mesh const * meshes,
-    rk_ubyte const * const * params) {
-    rk_batch const * const batch = reinterpret_cast<rk_batch const *>(_batch);
-    if (!batch || !count || count > batch->max_size) {
-        rk_printf("rk_fill_batch(): invalid params.");
-        return;
-    }
-    bool const need_params = (batch->nparams > 0);
-    bool got_params = false;
-    bool all_params = false;
-    if (params != nullptr) {
-        all_params = true;
-        for (rk_ubyte const * const * param = params; param < params + batch->nparams; ++param) {
-            bool const got_param = (*param != nullptr);
-            got_params |= got_param;
-            all_params &= got_param;
-        }
-    }
-    bool const got_all = (flags && meshes && (!need_params || all_params));
-    if (count > batch->count && !got_all) {
-        rk_printf("rk_fill_batch(): cannot grow without all flags, meshes and params.");
-        return;
-    }
-    bool const need_sorting = (flags || meshes || count != batch->count);
-    batch->count = count;
-    if (flags) {
-        memcpy(batch->flags, flags, count * sizeof(rk_instance_flags));
-    }
-    if (meshes) {
-        memcpy(batch->meshes, meshes, count * sizeof(rk_mesh));
-    }
-    if (need_params && got_params) {
-        rk_ubyte const * const * src = params;
-        for (rk_parameter const * dst = batch->params; dst < batch->params + batch->nparams; ++dst, ++src) {
-            dst->dirty = (*src || need_sorting);
-            if (*src) {
-                memcpy(dst->source, *src, count * dst->src_size);
-            }
-        }
-    }
-    if (need_sorting) {
-        batch->state = RK_BATCH_STATE_FILLED;
-    } else {
-        batch->state = RK_BATCH_STATE_SORTED;
-    }
-}
-
-static void rk_sort_batch(
-    rk_batch const & batch) {
-    rk_instance_flags const * flags = batch.flags;
-    rk_ushort * indices = batch.indices;
-    for (unsigned index = 0; index < batch.count; ++index, ++flags) {
-        if ((*flags & RK_INSTANCE_FLAGS_SPAWNED_VISIBLE) == RK_INSTANCE_FLAGS_SPAWNED_VISIBLE) {
-            *indices++ = index;
-        }
-    }
-    batch.ninstances = indices - batch.indices;
-    batch.ncommands = 0;
-    if (batch.ninstances) {
-        rk_command * const last_command = batch.commands + batch.max_meshes;
-        rk_command * command = batch.commands;
-        rk_ushort * base = batch.indices;
-        rk_ushort * const last = batch.indices + batch.ninstances;
-        for (rk_ushort * first = batch.indices; first < last && command < last_command; base = first, ++command) {
-            rk_mesh const & mesh = batch.meshes[*first++];
-            for ( ; first < last && batch.meshes[*first].packed == mesh.packed; ++first) {
-            }
-            for (rk_ushort * second = first; second < last; ++second) {
-                unsigned const index = *second;
-                if (batch.meshes[index].packed == mesh.packed) {
-                    *second = *first;
-                    *first++ = index;
-                }
-            }
-            command->nvertices = static_cast<GLuint>(mesh.ntriangles) * 3;
-            command->ninstances = first - base;
-            command->base_index = mesh.base_index;
-            command->base_vertex = 0;
-            command->base_instance = base - batch.indices;
-        }
-        batch.ncommands = command - batch.commands;
-    }
-    if (batch.nparams) {
-        batch.state = RK_BATCH_STATE_SORTED;
-    } else {
-        batch.state = RK_BATCH_STATE_PACKED;
-    }
-}
-
-static void rk_pack_batch(
-    rk_batch const & batch) {
-    for (rk_parameter const * param = batch.params; param < batch.params + batch.nparams; ++param) {
-        if (param->dirty) {
-            param->dirty = false;
-            if (batch.ninstances) {
-                rk_ubyte * const dst = reinterpret_cast<rk_ubyte *>(
-                    glMapBufferRange(GL_ARRAY_BUFFER, param->offset, batch.ninstances * param->dst_size,
-                        GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_UNSYNCHRONIZED_BIT));
-                if (dst) {
-                    param->packer(batch.ninstances, batch.indices, dst, param->source);
-                    glUnmapBuffer(GL_ARRAY_BUFFER);
-                }
-            }
-        }
-    }
-    batch.state = RK_BATCH_STATE_PACKED;
-}
-
 void rk_draw_batch(
    rk_batch_t _batch) {
-    rk_batch * const batch = reinterpret_cast<rk_batch *>(_batch);
+    rk_batch const * const batch = reinterpret_cast<rk_batch const *>(_batch);
    if (!batch) {
        rk_printf("rk_draw_batch(): invalid params.");
        return;
@ -896,50 +865,49 @@ void rk_draw_batch(
        rk_printf("rk_draw_batch(): invalid state.");
        return;
    }
-    if (batch->state < RK_BATCH_STATE_SORTED) {
-        rk_sort_batch(*batch);
-    }
-    if (!batch->ncommands) {
-        return;
-    }
    glBindVertexArray(batch->vertex_array);
    if (rk_MultiDrawElementsIndirect) {
        glBindBuffer(GL_DRAW_INDIRECT_BUFFER, batch->commands_buffer);
-        glBufferSubData(GL_DRAW_INDIRECT_BUFFER, 0, batch->ncommands * sizeof(rk_command), batch->commands);
    }
-    if (batch->state < RK_BATCH_STATE_PACKED) {
-        glBindBuffer(GL_ARRAY_BUFFER, batch->params_buffer);
+    if (batch->state < RK_BATCH_STATE_SORTED) {
+        rk_sort_batch(*batch);
+    }
+    if (batch->state < RK_BATCH_STATE_PACKED && batch->nparams) {
        rk_pack_batch(*batch);
-        glBindBuffer(GL_ARRAY_BUFFER, 0);
    }
-    rk_command const * const last_command = batch->commands + batch->ncommands;
-    if (rk_DrawElementsInstancedBaseInstance) {
-        if (rk_MultiDrawElementsIndirect) {
-            rk_MultiDrawElementsIndirect(
-                GL_TRIANGLES, GL_UNSIGNED_SHORT, nullptr, batch->ncommands, sizeof(rk_command));
-            glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
+    if (batch->ncommands) {
+        if (rk_DrawElementsInstancedBaseInstance) {
+            if (rk_MultiDrawElementsIndirect) {
+                rk_MultiDrawElementsIndirect(
+                    GL_TRIANGLES, GL_UNSIGNED_SHORT, nullptr, batch->ncommands, sizeof(rk_command));
+            } else {
+                rk_command const * const last_command = batch->commands + batch->ncommands;
+                for (rk_command const * command = batch->commands; command < last_command; ++command) {
+                    rk_DrawElementsInstancedBaseInstance(
+                        GL_TRIANGLES, command->nvertices, GL_UNSIGNED_SHORT,
+                        reinterpret_cast<void const *>(command->base_index << 1),
+                        command->ninstances, command->base_instance);
+                }
+            }
        } else {
+            rk_command const * const last_command = batch->commands + batch->ncommands;
+            rk_parameter const * const last_param = batch->params + batch->nparams;
+            unsigned param_index = 0;
            for (rk_command const * command = batch->commands; command < last_command; ++command) {
-                rk_DrawElementsInstancedBaseInstance(
+                for (rk_parameter const * param = batch->params; param < last_param; ++param) {
+                    glBindVertexBuffer(param->binding, batch->params_buffer,
+                        param->offset + param_index * param->dst_size, param->dst_size);
+                }
+                glDrawElementsInstanced(
                    GL_TRIANGLES, command->nvertices, GL_UNSIGNED_SHORT,
                    reinterpret_cast<void const *>(command->base_index << 1),
-                    command->ninstances, command->base_instance);
+                    command->ninstances);
+                param_index += command->ninstances;
            }
        }
-    } else {
-        unsigned param_index = 0;
-        rk_parameter const * const last_param = batch->params + batch->nparams;
-        for (rk_command const * command = batch->commands; command < last_command; ++command) {
-            for (rk_parameter const * param = batch->params; param < last_param; ++param) {
-                glBindVertexBuffer(param->binding, batch->params_buffer,
-                    param->offset + param_index * param->dst_size, param->dst_size);
-            }
-            param_index += command->ninstances;
-            glDrawElementsInstanced(
-                GL_TRIANGLES, command->nvertices, GL_UNSIGNED_SHORT,
-                reinterpret_cast<void const *>(command->base_index << 1),
-                command->ninstances);
-        }
+    }
+    if (rk_MultiDrawElementsIndirect) {
+        glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
    }
    glBindVertexArray(0);
 }
@ -981,8 +949,6 @@ void rk_destroy_batch(
            delete[] batch->params;
            glDeleteBuffers(1, &batch->params_buffer);
        }
-        glDeleteBuffers(1, &batch->indices_buffer);
-        glDeleteBuffers(1, &batch->vertices_buffer);
        glDeleteVertexArrays(1, &batch->vertex_array);
        delete batch;
    }
@ -1005,6 +971,12 @@ void rk_destroy_vertices(
        delete[] vertices->format;
        delete[] vertices->vertices;
        delete[] vertices->indices;
+        if (vertices->vertices_buffer) {
+            glDeleteBuffers(1, &vertices->vertices_buffer);
+        }
+        if (vertices->indices_buffer) {
+            glDeleteBuffers(1, &vertices->indices_buffer);
+        }
        delete vertices;
    }
 }
--- a/cpp/render/render_opengles.hpp
+++ b/cpp/render/render_opengles.hpp
@ -16,11 +16,13 @@
 #ifndef _RK_ENGINE_RENDER_OPENGLES_H
 #define _RK_ENGINE_RENDER_OPENGLES_H

-#include "../types.hpp"
+#include "../render.hpp"
 #include <GLES3/gl32.h>
 #include <GLES3/gl3ext.h>
 #include <GLES3/gl3platform.h>

+static_assert(sizeof(rk_vertex_output) == 4);
+
 struct rk_shader {
    GLuint vertex;
    GLuint fragment;
@ -41,9 +43,13 @@ struct rk_triangles {
 struct rk_vertices {
    unsigned nvertices;
    unsigned nindices;
+    unsigned nmeshes;
    rk_vertex_format * format;
    rk_ubyte * vertices;
-    rk_ushort * indices;
+    rk_vertex_index * indices;
+    rk_mesh * meshes;
+    GLuint vertices_buffer;
+    GLuint indices_buffer;
 };

 struct rk_command {
@ -54,50 +60,30 @@ struct rk_command {
    GLuint base_instance;
 };

-struct rk_vec3_float {
-    float x;
-    float y;
-    float z;
-};
-
-struct rk_vec3_short {
-    rk_short x;
-    rk_short y;
-    rk_short z;
-    rk_short pad;
-};
-
-typedef rk_int rk_vec3_int10;
-typedef rk_uint rk_vec3_uint10;
-
-struct rk_mat3_float {
-    rk_vec3_float x;
-    rk_vec3_float y;
-    rk_vec3_float z;
-};
-
-struct rk_mat3_int10 {
-    rk_vec3_int10 x;
-    rk_vec3_int10 y;
-    rk_vec3_int10 z;
-};
-
 typedef void (*rk_packer)(
    unsigned const, // count
-    rk_ushort const * const, // indices
-    rk_ubyte *, // dst
-    rk_ubyte const * const); // src
+    rk_instance_index const * const __restrict, // indices
+    rk_vertex_output * __restrict, // dst
+    rk_vertex_input const * const __restrict); // src

 struct rk_parameter {
    mutable bool dirty;
    unsigned binding;
    unsigned offset;
    unsigned src_size;
+    unsigned src_len;
    unsigned dst_size;
-    rk_ubyte * source;
+    unsigned dst_len;
+    rk_vertex_input * source;
    rk_packer packer;
 };

+struct rk_bucket { // scratch list of instance indices; rk_sort_batch() fills one bucket per mesh
+    unsigned size; // not used in the visible code; presumably the capacity of `indices` -- confirm
+    unsigned count; // number of valid entries in `indices`; reset to 0 at the start of every sort
+    rk_instance_index * indices; // indices of the instances collected in this bucket
+};
+
 enum rk_batch_state {
    RK_BATCH_STATE_EMPTY = 0,
    RK_BATCH_STATE_FILLED = 1,
@ -111,16 +97,14 @@ struct rk_batch {
    mutable unsigned ninstances;
    mutable unsigned ncommands;
    unsigned max_size;
-    unsigned max_meshes;
    unsigned nparams;
+    rk_vertices const * vertices;
    rk_instance_flags * flags;
-    rk_mesh * meshes;
-    rk_ushort * indices;
+    rk_mesh_index * meshes;
+    rk_instance_index * indices;
    rk_command * commands;
    rk_parameter * params;
    GLuint vertex_array;
-    GLuint vertices_buffer;
-    GLuint indices_buffer;
    GLuint commands_buffer;
    GLuint params_buffer;
 };
--- a/cpp/types.hpp
+++ b/cpp/types.hpp
@ -20,6 +20,11 @@
 #include <cstdint>

 #define RK_EXPORT extern "C"
+#define RK_HOT gnu::hot
+#define RK_FLATTEN gnu::flatten
+#define RK_UNROLLED gnu::optimize("unroll-loops")
+#define RK_FAST gnu::optimize("Ofast")
+#define RK_INVALID_HANDLE nullptr
 #define RK_FLAG(_bit) (1 << (_bit))

 typedef bool rk_bool;
@ -33,9 +38,33 @@ typedef int32_t rk_int;
 typedef uint32_t rk_uint;
 typedef int64_t rk_long;
 typedef uint64_t rk_ulong;
+typedef __int128 rk_llong;
+typedef unsigned __int128 rk_ullong;
 typedef float rk_float;
 typedef void * rk_handle_t;

-#define RK_INVALID_HANDLE nullptr
+static_assert(sizeof(rk_char) == 1);
+static_assert(sizeof(rk_wchar) == 4);
+static_assert(sizeof(rk_float) == 4);
+
+#pragma pack(push, 4)
+
+template<bool _signed, unsigned _cols> // primary template is empty; the two specializations on _signed hold the data
+struct rk_packed {
+};
+
+template<unsigned _cols> // signed variant: a single rk_int word
+struct alignas(alignof(rk_int)) rk_packed<true, _cols> {
+    typedef rk_int type; // underlying storage type
+    rk_int packed; // _cols is not used here; presumably the number of components packed in the word -- confirm
+};
+
+template<unsigned _cols> // unsigned variant: a single rk_uint word
+struct alignas(alignof(rk_uint)) rk_packed<false, _cols> {
+    typedef rk_uint type; // underlying storage type
+    rk_uint packed; // _cols is not used here; presumably the number of components packed in the word -- confirm
+};
+
+#pragma pack(pop)

 #endif // _RK_ENGINE_TYPES_H
--- a/cpp/utils/cmp_memcpy.hpp
+++ b/cpp/utils/cmp_memcpy.hpp
@ -0,0 +1,102 @@
+// Copyright (C) 2023 RozK
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef RK_ENGINE_CMP_MEMCPY_H
+#define RK_ENGINE_CMP_MEMCPY_H
+
+#include "../types.hpp"
+
+// Copies `count` elements from src to dst one element at a time and reports whether any element of
+// dst differed from src before the copy (differences are accumulated with XOR, so _small must be
+// an integer type). A count of 0 copies nothing and returns false.
+template<typename _small>
+[[RK_FAST]]
+inline bool _rk_cmp_memcpy_small(
+    _small * __restrict dst,
+    _small const * __restrict src,
+    unsigned count) {
+    _small cmp = 0;
+    // Test before each element: a do/while would run once for count == 0 and wrap the counter.
+    for (; count > 0; --count) {
+        cmp |= *dst ^ *src;
+        *dst++ = *src++;
+    }
+    return (cmp != 0);
+}
+
+// Same contract as _rk_cmp_memcpy_small(), but processes the buffers as words of type _big
+// (sizeof(_big) / sizeof(_small) elements at a time) and finishes the remaining elements with
+// _rk_cmp_memcpy_small(). Returns true when dst differed from src before the copy.
+//
+// The pointers are reinterpreted as _big, so callers must guarantee whatever alignment the target
+// requires for _big accesses (rk_cmp_memcpy() checks it unless RK_CMP_MEMCPY_UNALIGNED is defined).
+// NOTE(review): reading _small objects through _big lvalues relies on the compiler tolerating this
+// kind of type punning -- confirm the build settings make it safe.
+template<typename _big, typename _small>
+[[RK_FAST, RK_FLATTEN]]
+inline bool _rk_cmp_memcpy_big(
+    _small * const __restrict _dst,
+    _small const * const __restrict _src,
+    unsigned const _count) {
+    unsigned const ratio = sizeof(_big) / sizeof(_small);
+    unsigned big_count = _count / ratio;
+    unsigned const small_count = _count % ratio;
+    _big * dst = reinterpret_cast<_big *>(_dst);
+    _big const * src = reinterpret_cast<_big const *>(_src);
+    _big cmp = 0;
+    // Test before each word: a do/while would run once and wrap the counter when _count < ratio.
+    for (; big_count > 0; --big_count) {
+        cmp |= *dst ^ *src;
+        *dst++ = *src++;
+    }
+    bool modified = (cmp != 0);
+    if (small_count) {
+        // Tail that does not fill a whole _big word.
+        modified |= _rk_cmp_memcpy_small<_small>(
+            reinterpret_cast<_small *>(dst), reinterpret_cast<_small const *>(src), small_count);
+    }
+    return modified;
+}
+
+// _rk_count_and_alignment(_t): true when the word type _t can be used by rk_cmp_memcpy() below.
+#ifdef RK_CMP_MEMCPY_UNALIGNED
+// Unaligned accesses allowed: there only need to be enough elements to fill one word.
+#define _rk_count_and_alignment(_t) (count >= (sizeof(_t) / sizeof(_small)))
+#else
+// Otherwise both pointers must also be aligned on the word size.
+#define _rk_count_and_alignment(_t) ((count >= (sizeof(_t) / sizeof(_small))) && !(alignment % sizeof(_t)))
+#endif
+
+// Copies `count` elements from _src to _dst and returns true when the destination differed from
+// the source before the copy (false when count is 0). Uses the widest word type, from rk_ullong
+// down to rk_ushort, that is wider than _small and passes _rk_count_and_alignment(), and falls
+// back to an element-by-element copy otherwise.
+//
+// The size tests are `if constexpr` so that _rk_cmp_memcpy_big() is never instantiated with a word
+// type that is not wider than _small (which would give it an element ratio of 0).
+template<typename _small>
+[[RK_HOT, RK_FAST, RK_FLATTEN]]
+bool rk_cmp_memcpy(
+    _small * const __restrict _dst,
+    _small const * const __restrict _src,
+    unsigned const count) {
+#ifndef RK_CMP_MEMCPY_UNALIGNED
+    // OR of both addresses: its low bits are clear only when both pointers are aligned.
+    [[maybe_unused]] unsigned const alignment =
+        reinterpret_cast<uintptr_t>(_dst) | reinterpret_cast<uintptr_t const>(_src);
+#endif
+    if constexpr (sizeof(_small) < sizeof(rk_ullong)) {
+        if (_rk_count_and_alignment(rk_ullong)) {
+            return _rk_cmp_memcpy_big<rk_ullong, _small>(_dst, _src, count);
+        }
+    }
+    if constexpr (sizeof(_small) < sizeof(rk_ulong)) {
+        if (_rk_count_and_alignment(rk_ulong)) {
+            return _rk_cmp_memcpy_big<rk_ulong, _small>(_dst, _src, count);
+        }
+    }
+    if constexpr (sizeof(_small) < sizeof(rk_uint)) {
+        if (_rk_count_and_alignment(rk_uint)) {
+            return _rk_cmp_memcpy_big<rk_uint, _small>(_dst, _src, count);
+        }
+    }
+    if constexpr (sizeof(_small) < sizeof(rk_ushort)) {
+        if (_rk_count_and_alignment(rk_ushort)) {
+            return _rk_cmp_memcpy_big<rk_ushort, _small>(_dst, _src, count);
+        }
+    }
+    if (count) {
+        return _rk_cmp_memcpy_small<_small>(_dst, _src, count);
+    }
+    return false;
+}
+
+#undef _rk_count_and_alignment
+
+#endif // RK_ENGINE_CMP_MEMCPY_H
--- a/cpp/utils/vertex_format.hpp
+++ b/cpp/utils/vertex_format.hpp
@ -0,0 +1,258 @@
+// Copyright (C) 2023 RozK
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef RK_ENGINE_VERTEX_FORMAT_H
+#define RK_ENGINE_VERTEX_FORMAT_H
+
+#include "../render.hpp"
+#include <limits>
+
+namespace rk_vertex {
+
+#pragma pack(push, 1)
+
+// Thin wrapper around one source component of type `_type`.
+// Declared while `#pragma pack(push, 1)` is in effect; the explicit alignas
+// gives the wrapper the alignment of `_type` itself.
+template<typename _type>
+struct alignas(alignof(_type)) rk_input {
+    _type input;
+};
+
+// Generic destination component: holds one value of type `_type`, converted
+// from an `_input` component with a plain static_cast.
+// `_signed` and `_normalized` are not used by this primary template; the
+// specialisations select on them.
+template<typename _type, typename _input, bool _signed, bool _normalized>
+struct alignas(alignof(_type)) rk_output {
+    _type output;
+
+    // Stores `src.input` converted to `_type`, without any scaling.
+    [[RK_FAST]]
+    inline void convert(
+        rk_input<_input> const & __restrict src) {
+        output = static_cast<_type>(src.input);
+    }
+};
+
+// Normalised conversion from a float component: the source value is scaled by
+// the largest value representable in `_type` before being truncated to `_type`.
+template<typename _type, bool _signed>
+struct alignas(alignof(_type)) rk_output<_type, rk_float, _signed, true> {
+    _type output;
+
+    // Stores `src.input * max(_type)` converted to `_type`.
+    [[RK_FAST]]
+    inline void convert(
+        rk_input<rk_float> const & __restrict src) {
+        // Only integer destinations make sense here (same restriction as an
+        // enum with `_type` as underlying type).
+        static_assert(std::numeric_limits<_type>::is_integer);
+        constexpr float scale = static_cast<float>(std::numeric_limits<_type>::max());
+        output = static_cast<_type>(src.input * scale);
+    }
+};
+
+#pragma pack(4)
+
+// One source row: `_cols` consecutive input components.
+// Declared while `#pragma pack(4)` is in effect and explicitly aligned on 4 bytes.
+template<typename _input, unsigned _cols>
+struct alignas(4) rk_input_row {
+    rk_input<_input> input_col[_cols];
+};
+
+// Generic destination row: `_cols` output components, each converted on its own
+// from the source component at the same position.
+template<typename _output, typename _input, unsigned _cols, bool _signed, bool _normalized>
+struct alignas(4) rk_output_row {
+    rk_output<_output, _input, _signed, _normalized> output_col[_cols];
+
+    // Converts every column of `src` into the matching column of this row.
+    [[RK_FAST, RK_FLATTEN, RK_UNROLLED]]
+    inline void convert(
+        rk_input_row<_input, _cols> const & __restrict src) {
+        unsigned index = 0;
+        while (index != _cols) {
+            output_col[index].convert(src.input_col[index]);
+            ++index;
+        }
+    }
+};
+
+// Specialisation used when the output and input component types are the same:
+// no conversion is needed, the columns are copied verbatim.
+template<typename _output, unsigned _cols, bool _signed, bool _normalized>
+struct alignas(4) rk_output_row<_output, _output, _cols, _signed, _normalized> {
+    rk_output<_output, _output, _signed, _normalized> output_col[_cols];
+
+    // Copies all `_cols` columns of `src` into this row.
+    [[RK_FAST, RK_UNROLLED]]
+    inline void convert(
+        rk_input_row<_output, _cols> const & __restrict src) {
+        static_assert(sizeof(output_col) == sizeof(src.input_col));
+        // Dereferencing the arrays themselves (`*output_col = *input_col`) decays
+        // them to their first element and copies a single column, so every
+        // column is assigned explicitly.
+        for (unsigned col = 0; col < _cols; ++col) {
+            output_col[col].output = src.input_col[col].input;
+        }
+    }
+};
+
+// Specialisation for packed outputs: the whole row of `_cols` source components
+// is stored into a single rk_packed value instead of one output per column.
+template<typename _input, unsigned _cols, bool _signed, bool _normalized>
+struct alignas(4) rk_output_row<rk_packed<_signed, _cols>, _input, _cols, _signed, _normalized> {
+    rk_output<rk_packed<_signed, _cols>, _input, _signed, _normalized> output_cols;
+
+    // Hands the complete source row to the packed rk_output specialisation.
+    [[RK_FAST, RK_FLATTEN]]
+    inline void convert(
+        rk_input_row<_input, _cols> const & __restrict src) {
+        output_cols.convert(src);
+    }
+};
+
+// One complete source element: `_rows` rows of `_cols` input components each.
+template<typename _input, unsigned _cols, unsigned _rows>
+struct alignas(4) rk_input_format {
+    rk_input_row<_input, _cols> input_row[_rows];
+};
+
+// Generic destination element: `_rows` output rows, each converted from the
+// source row at the same position.
+template<typename _output, typename _input, unsigned _cols, unsigned _rows, bool _signed, bool _normalized>
+struct alignas(4) rk_output_format {
+    rk_output_row<_output, _input, _cols, _signed, _normalized> output_row[_rows];
+
+    // Converts every row of `src` into the matching row of this element.
+    [[RK_FAST, RK_FLATTEN, RK_UNROLLED]]
+    inline void convert(
+        rk_input_format<_input, _cols, _rows> const & __restrict src) {
+        unsigned index = 0;
+        while (index != _rows) {
+            output_row[index].convert(src.input_row[index]);
+            ++index;
+        }
+    }
+};
+
+// Specialisation used when the output and input component types are the same:
+// the element is copied verbatim, without any per-component conversion.
+template<typename _output, unsigned _cols, unsigned _rows, bool _signed, bool _normalized>
+struct alignas(4) rk_output_format<_output, _output, _cols, _rows, _signed, _normalized> {
+    rk_output_row<_output, _output, _cols, _signed, _normalized> output_row[_rows];
+
+    // Copies all `_rows` x `_cols` components of `src` into this element.
+    [[RK_FAST, RK_UNROLLED]]
+    inline void convert(
+        rk_input_format<_output, _cols, _rows> const & __restrict src) {
+        static_assert(sizeof(output_row) == sizeof(src.input_row));
+        // Dereferencing the arrays themselves (`*output_row = *input_row`) decays
+        // them to their first element and copies the first row only, which drops
+        // rows 1.. of multi-row formats. Every component is assigned explicitly.
+        for (unsigned row = 0; row < _rows; ++row) {
+            for (unsigned col = 0; col < _cols; ++col) {
+                output_row[row].output_col[col].output = src.input_row[row].input_col[col].input;
+            }
+        }
+    }
+};
+
+// Packs a row of three components into one rk_packed<_signed, 3> value:
+// 10 bits per component, first component in the lowest bits. No scaling is applied.
+// NOTE(review): for _input = rk_float with _normalized = true, this partial
+// specialisation and the generic float-normalised rk_output specialisation both
+// appear to match; only the signed case has an explicit specialisation. Confirm
+// the unsigned normalised packed format is unused or handled elsewhere.
+template<typename _input, bool _signed, bool _normalized>
+struct alignas(alignof(rk_packed<_signed, 3>)) rk_output<rk_packed<_signed, 3>, _input, _signed, _normalized> {
+    rk_packed<_signed, 3> output;
+
+    // Truncates each source component to the packed integer type, keeps its low
+    // 10 bits and stores the three fields at bit offsets 0, 10 and 20.
+    [[RK_FAST]]
+    inline void convert(
+        rk_input_row<_input, 3> const & __restrict src) {
+        using word = typename rk_packed<_signed, 3>::type;
+        word const x = static_cast<word>(src.input_col[0].input) & 1023;
+        word const y = static_cast<word>(src.input_col[1].input) & 1023;
+        word const z = static_cast<word>(src.input_col[2].input) & 1023;
+        output.packed = x | (y << 10) | (z << 20);
+    }
+};
+
+// Packs a row of four components into one rk_packed<_signed, 4> value:
+// three 10-bit fields followed by a 2-bit field in the top bits. No scaling is applied.
+template<typename _input, bool _signed, bool _normalized>
+struct alignas(alignof(rk_packed<_signed, 4>)) rk_output<rk_packed<_signed, 4>, _input, _signed, _normalized> {
+    rk_packed<_signed, 4> output;
+
+    // Truncates each source component to the packed integer type and stores the
+    // fields at bit offsets 0, 10, 20 (10 bits each) and 30 (2 bits).
+    [[RK_FAST]]
+    inline void convert(
+        rk_input_row<_input, 4> const & __restrict src) {
+        using word = typename rk_packed<_signed, 4>::type;
+        word const x = static_cast<word>(src.input_col[0].input) & 1023;
+        word const y = static_cast<word>(src.input_col[1].input) & 1023;
+        word const z = static_cast<word>(src.input_col[2].input) & 1023;
+        word const w = static_cast<word>(src.input_col[3].input) & 3;
+        output.packed = x | (y << 10) | (z << 20) | (w << 30);
+    }
+};
+
+// Normalised signed packing of three float components: each one is scaled by
+// 511 (the largest positive 10-bit signed value) before being packed.
+template<>
+struct alignas(alignof(rk_packed<true, 3>)) rk_output<rk_packed<true, 3>, rk_float, true, true> {
+    rk_packed<true, 3> output;
+
+    // Scales, truncates to rk_int, keeps the low 10 bits of each component and
+    // stores the three fields at bit offsets 0, 10 and 20.
+    [[RK_FAST]]
+    inline void convert(
+        rk_input_row<rk_float, 3> const & __restrict src) {
+        rk_int const x = static_cast<rk_int>(src.input_col[0].input * 511.f) & 1023;
+        rk_int const y = static_cast<rk_int>(src.input_col[1].input * 511.f) & 1023;
+        rk_int const z = static_cast<rk_int>(src.input_col[2].input * 511.f) & 1023;
+        output.packed = x | (y << 10) | (z << 20);
+    }
+};
+
+// Normalised signed packing of four float components: the first three are
+// scaled by 511 (the largest positive 10-bit signed value), the fourth is
+// stored unscaled in the 2 top bits.
+template<>
+struct alignas(alignof(rk_packed<true, 4>)) rk_output<rk_packed<true, 4>, rk_float, true, true> {
+    rk_packed<true, 4> output;
+
+    // Stores the fields at bit offsets 0, 10, 20 (10 bits each) and 30 (2 bits).
+    [[RK_FAST]]
+    inline void convert(
+        rk_input_row<rk_float, 4> const & __restrict src) {
+        // Converting a negative float directly to an unsigned integer is undefined
+        // behaviour, so each component goes through rk_int first (as the
+        // 3-component version does); the rk_int -> rk_uint step is well defined
+        // and keeps the shifts in unsigned arithmetic.
+        rk_uint const x = static_cast<rk_uint>(static_cast<rk_int>(src.input_col[0].input * 511.f)) & 1023;
+        rk_uint const y = static_cast<rk_uint>(static_cast<rk_int>(src.input_col[1].input * 511.f)) & 1023;
+        rk_uint const z = static_cast<rk_uint>(static_cast<rk_int>(src.input_col[2].input * 511.f)) & 1023;
+        rk_uint const w = static_cast<rk_uint>(static_cast<rk_int>(src.input_col[3].input)) & 3;
+        output.packed = x | (y << 10) | (z << 20) | (w << 30);
+    }
+};
+
+#pragma pack(pop)
+
+// Bundles the source and destination layouts of one format together with the
+// helpers operating on them.
+//   _output, _input      : component types of the destination and source data
+//   _cols, _rows         : components per row and rows per element
+//   _signed, _normalized : forwarded to the rk_output / rk_output_row templates
+template<typename _output, typename _input, unsigned _cols, unsigned _rows, bool _signed, bool _normalized>
+struct rk_format {
+    typedef rk_input<_input> input;
+    // Keyed on the raw `_input` type (not on the rk_input wrapper) so that it
+    // names the very rk_output instantiation that output_row stores, and the
+    // size check below applies to the type actually in use.
+    typedef rk_output<_output, _input, _signed, _normalized> output;
+
+    typedef rk_input_row<_input, _cols> input_row;
+    typedef rk_output_row<_output, _input, _cols, _signed, _normalized> output_row;
+
+    typedef rk_input_format<_input, _cols, _rows> input_format;
+    typedef rk_output_format<_output, _input, _cols, _rows, _signed, _normalized> output_format;
+
+    // The wrappers must not add any padding around a component, and rows and
+    // elements must be whole multiples of the generic vertex input/output units.
+    static_assert(sizeof(input) == sizeof(_input));
+    static_assert(sizeof(output) == sizeof(_output));
+    static_assert((sizeof(input_row) % sizeof(rk_vertex_input)) == 0);
+    static_assert((sizeof(output_row) % sizeof(rk_vertex_output)) == 0);
+    static_assert((sizeof(input_format) % sizeof(rk_vertex_input)) == 0);
+    static_assert((sizeof(output_format) % sizeof(rk_vertex_output)) == 0);
+
+    // Size in bytes of one source element.
+    static unsigned get_input_size() {
+        return sizeof(input_format);
+    }
+
+    // Size in bytes of one destination element.
+    static unsigned get_output_size() {
+        return sizeof(output_format);
+    }
+
+    // Byte offset of row `index` inside a destination element.
+    static unsigned get_output_offset(unsigned const index) {
+        return index * sizeof(output_row);
+    }
+
+    // Converts one source element into one destination element.
+    [[RK_FAST, RK_FLATTEN]]
+    inline static void convert(
+        output_format & __restrict dst,
+        input_format const & __restrict src) {
+        dst.convert(src);
+    }
+
+    // Gathers and converts `count` elements: for each entry of `indices`, the
+    // source element at that index is converted into the next consecutive
+    // destination element, starting at `_dst`.
+    [[RK_HOT, RK_FAST, RK_FLATTEN]]
+    static void param_packer(
+        unsigned const count,
+        rk_instance_index const * const __restrict indices,
+        rk_vertex_output * __restrict _dst,
+        rk_vertex_input const * const __restrict _src) {
+        rk_instance_index const * const last_index = indices + count;
+        output_format * __restrict dst = reinterpret_cast<output_format *>(_dst);
+        input_format const * const __restrict src = reinterpret_cast<input_format const *>(_src);
+        for (rk_instance_index const * __restrict index = indices; index < last_index; ++index, ++dst) {
+            dst->convert(src[*index]);
+        }
+    }
+};
+
+} // namespace rk_vertex
+
+// Concrete formats, all fed from float sources.
+// Template arguments: output type, input type, columns, rows, signed, normalized.
+using rk_vec3_float       = rk_vertex::rk_format<rk_float,            rk_float, 3, 1, true,  false>;
+using rk_vec3_short       = rk_vertex::rk_format<rk_short,            rk_float, 3, 1, true,  false>;
+using rk_vec3_short_norm  = rk_vertex::rk_format<rk_short,            rk_float, 3, 1, true,  true>;
+using rk_vec3_int10       = rk_vertex::rk_format<rk_packed<true,  3>, rk_float, 3, 1, true,  false>;
+using rk_vec3_int10_norm  = rk_vertex::rk_format<rk_packed<true,  3>, rk_float, 3, 1, true,  true>;
+using rk_vec3_uint10      = rk_vertex::rk_format<rk_packed<false, 3>, rk_float, 3, 1, false, false>;
+using rk_vec3_uint10_norm = rk_vertex::rk_format<rk_packed<false, 3>, rk_float, 3, 1, false, true>;
+using rk_mat3_float       = rk_vertex::rk_format<rk_float,            rk_float, 3, 3, true,  false>;
+using rk_mat3_int10       = rk_vertex::rk_format<rk_packed<true,  3>, rk_float, 3, 3, true,  false>;
+using rk_mat3_int10_norm  = rk_vertex::rk_format<rk_packed<true,  3>, rk_float, 3, 3, true,  true>;
+
+#endif // RK_ENGINE_VERTEX_FORMAT_H
Author	SHA1	Message	Date
Roz K	7d35ac0e5b	Move vertex format to utils.	2023-01-07 06:30:08 +01:00
Roz K	7384a014ff	Improve enum declaration.	2023-01-06 18:42:58 +01:00
Roz K	cb763962fd	Merge vertex and param formats.	2023-01-06 18:40:58 +01:00
Roz K	39c449a763	Add direct copy when output and input vertex formats are matching.	2023-01-06 18:16:10 +01:00
Roz K	a02d8e4d7d	Fix unsigned packed vertex format.	2023-01-06 18:15:19 +01:00
Roz K	81d52086fe	Rework params packing.	2023-01-06 17:05:09 +01:00
Roz K	16c7c91508	Add function attributes to cmp_memcpy.	2023-01-06 17:03:21 +01:00
Roz K	9181d58ecd	Add template for packed ints and fix missing define.	2023-01-06 17:02:24 +01:00
Roz K	a6ec35ebd1	Remove heavy optims flags from makefile and add predefined function attributes instead.	2023-01-06 17:00:11 +01:00
Roz K	596caef7ee	Move cmp_memcpy into directory utils.	2023-01-05 03:39:32 +01:00
Roz K	f463db316f	Fix signed integers normalisation.	2023-01-04 15:24:18 +01:00
Roz K	59d13684be	Improve typing in render.	2023-01-04 12:41:05 +01:00
Roz K	d6ec77207f	Cleanup includes.	2023-01-04 10:22:12 +01:00
Roz K	b46b4bddba	Improve typing in render.	2023-01-04 09:33:49 +01:00
Roz K	ebc6ededf3	Improve mesh struct after switching to mesh indices.	2023-01-04 09:06:13 +01:00
Roz K	74b6f58794	Fix redondant modification test in batch sorting.	2023-01-03 21:41:03 +01:00
Roz K	39a95e24c3	Switch to buckets sorting.	2023-01-03 21:31:36 +01:00
Roz K	3e0ea2560a	Improve compare-memcopy.	2023-01-03 20:50:18 +01:00
Roz K	558ec08614	Buckets sorting.	2023-01-03 16:06:11 +01:00
Roz K	211762c279	Remove obvious flags from makefile.	2023-01-03 16:05:37 +01:00
Roz K	a5adfacdfd	Use mesh indices in batch.	2023-01-03 14:05:25 +01:00
Roz K	d0741afda7	Stores meshes into vertices.	2023-01-03 12:59:07 +01:00
Roz K	66980e6ea9	Move vertices and indicies buffers to vertices.	2023-01-03 11:38:19 +01:00
Roz K	a91a852887	Automatic batch freezing.	2023-01-03 11:10:37 +01:00
Roz K	357066b315	Cleanup makefile.	2023-01-03 11:10:14 +01:00
Roz K	baac333b44	Batch freezing.	2023-01-03 05:28:24 +01:00