mirror of https://gitee.com/namelin2022/ollama
committed by
GitHub
67 changed files with 785 additions and 4354 deletions
@ -0,0 +1,160 @@ |
|||
package convert |
|||
|
|||
import ( |
|||
"strings" |
|||
|
|||
"github.com/ollama/ollama/fs/ggml" |
|||
"github.com/pdevine/tensor" |
|||
"github.com/pdevine/tensor/native" |
|||
) |
|||
|
|||
// mllamaModel describes the config.json layout of a Llama 3.2 vision
// ("mllama") checkpoint as read for conversion to GGUF.
type mllamaModel struct {
	ModelParameters
	// TextModel mirrors the "text_config" section; it embeds llamaModel so the
	// text weights and hyperparameters convert exactly like a plain llama model.
	TextModel struct {
		llamaModel

		// Indices of the decoder layers that cross-attend to the vision encoder.
		CrossAttentionLayers []int32 `json:"cross_attention_layers"`
	} `json:"text_config"`
	// VisionModel mirrors the "vision_config" section (the vision tower).
	VisionModel struct {
		NumHiddenLayers uint32 `json:"num_hidden_layers"` // local encoder depth
		NumGlobalLayers uint32 `json:"num_global_layers"` // global encoder depth
		// Layers whose hidden states are also emitted as intermediate outputs.
		IntermediateLayersIndices []int32 `json:"intermediate_layers_indices"`

		HiddenSize       uint32 `json:"hidden_size"`
		IntermediateSize uint32 `json:"intermediate_size"`

		AttentionHeads uint32 `json:"attention_heads"`

		ImageSize   uint32  `json:"image_size"`
		PatchSize   uint32  `json:"patch_size"`
		NumChannels uint32  `json:"num_channels"`
		MaxNumTiles uint32  `json:"max_num_tiles"`
		NormEpsilon float32 `json:"norm_eps"`
		RopeTheta   float32 `json:"rope.freq_base"`
	} `json:"vision_config"`
}
|||
|
|||
func (m *mllamaModel) KV(t *Tokenizer) ggml.KV { |
|||
kv := m.ModelParameters.KV(t) |
|||
kv["general.architecture"] = "mllama" |
|||
|
|||
for k, v := range m.TextModel.KV(t) { |
|||
if strings.HasPrefix(k, "llama.") { |
|||
kv[strings.ReplaceAll(k, "llama.", "mllama.")] = v |
|||
} |
|||
} |
|||
|
|||
kv["mllama.attention.cross_attention_layers"] = m.TextModel.CrossAttentionLayers |
|||
|
|||
kv["mllama.vision.block_count"] = m.VisionModel.NumHiddenLayers |
|||
kv["mllama.vision.global.block_count"] = m.VisionModel.NumGlobalLayers |
|||
kv["mllama.vision.intermediate_layers_indices"] = m.VisionModel.IntermediateLayersIndices |
|||
|
|||
kv["mllama.vision.embedding_length"] = m.VisionModel.HiddenSize |
|||
kv["mllama.vision.feed_forward_length"] = m.VisionModel.IntermediateSize |
|||
|
|||
kv["mllama.vision.attention.head_count"] = m.VisionModel.AttentionHeads |
|||
kv["mllama.vision.attention.layer_norm_epsilon"] = m.VisionModel.NormEpsilon |
|||
|
|||
kv["mllama.vision.image_size"] = m.VisionModel.ImageSize |
|||
kv["mllama.vision.patch_size"] = m.VisionModel.PatchSize |
|||
kv["mllama.vision.max_num_tiles"] = m.VisionModel.MaxNumTiles |
|||
kv["mllama.vision.num_channels"] = m.VisionModel.NumChannels |
|||
|
|||
return kv |
|||
} |
|||
|
|||
func (m *mllamaModel) Replacements() []string { |
|||
return append( |
|||
m.TextModel.Replacements(), |
|||
"language_model.", "", |
|||
"gate_attn", "attn_gate", |
|||
"gate_ffn", "ffn_gate", |
|||
"cross_attn.", "cross_attn_", |
|||
"vision_model", "v", |
|||
"class_embedding", "class_embd", |
|||
"patch_embedding", "patch_embd", |
|||
"gated_positional_embedding.tile_embedding", "tile_position_embd", |
|||
"gated_positional_embedding.embedding", "position_embd.weight", |
|||
"gated_positional_embedding", "position_embd", |
|||
"embedding.weight", "weight", |
|||
"pre_tile_positional_embedding", "pre_tile_position_embd", |
|||
"post_tile_positional_embedding", "post_tile_position_embd", |
|||
"layernorm_pre", "pre_ln", |
|||
"layernorm_post", "post_ln", |
|||
"global_transformer.layers", "global.blk", |
|||
"transformer.layers", "blk", |
|||
"mlp.fc1", "ffn_up", |
|||
"mlp.fc2", "ffn_down", |
|||
"multi_modal_projector", "mm.0", |
|||
) |
|||
} |
|||
|
|||
// Tensors partitions the input tensors: vision ("v.") and projector ("mm.")
// tensors are emitted directly (gate tensors get a tanh repacker attached),
// while everything else is delegated to the embedded llama text model.
func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
	var out []*ggml.Tensor
	var text []Tensor
	for _, t := range ts {
		if t.Name() == "v.position_embd.gate" {
			// This single source gate is emitted under both the position and
			// tile-position embedding names; each output gets its own clone so
			// the two repackers don't share state.
			for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
				tt := t.Clone()
				tt.SetRepacker(m.repack(name))
				out = append(out, &ggml.Tensor{
					Name:     name,
					Kind:     t.Kind(),
					Shape:    t.Shape(),
					WriterTo: tt,
				})
			}
		} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
			// Tile gates are written through the tanh repacker (see repack).
			t.SetRepacker(m.repack(t.Name()))
			out = append(out, &ggml.Tensor{
				Name:     t.Name(),
				Kind:     t.Kind(),
				Shape:    t.Shape(),
				WriterTo: t,
			})
		} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
			// Remaining vision/projector tensors pass through unchanged.
			out = append(out, &ggml.Tensor{
				Name:     t.Name(),
				Kind:     t.Kind(),
				Shape:    t.Shape(),
				WriterTo: t,
			})
		} else {
			// Everything else belongs to the text model.
			text = append(text, t)
		}
	}

	return append(out, m.TextModel.Tensors(text)...)
}
|||
|
|||
// repack returns a Repacker that applies tanh to a gate tensor — and, for
// "v.position_embd.gate" only, computes 1 - tanh(x) — then flattens the
// result to a 1-D vector for writing.
func (m *mllamaModel) repack(name string) Repacker {
	return func(_ string, data []float32, shape []uint64) (_ []float32, err error) {
		// Convert the GGUF shape (uint64) into the int dims tensor.New expects.
		dims := make([]int, len(shape))
		for i, dim := range shape {
			dims[i] = int(dim)
		}

		var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))

		t, err = tensor.Tanh(t)
		if err != nil {
			return nil, err
		}

		// Only the position-embedding gate is inverted: 1 - tanh(x).
		if name == "v.position_embd.gate" {
			t, err = tensor.Sub(float32(1), t)
			if err != nil {
				return nil, err
			}
		}

		// Force evaluation of the lazy ops before reshaping/extracting.
		t = tensor.Materialize(t)
		// flatten tensor so it can be returned as a vector
		if err := t.Reshape(t.Shape().TotalSize()); err != nil {
			return nil, err
		}

		return native.VectorF32(t.(*tensor.Dense))
	}
}
|||
@ -1,887 +0,0 @@ |
|||
// NOTE: This is modified from clip.cpp for Mllama only
|
|||
#include "mllama.h" |
|||
|
|||
#include "ggml-alloc.h" |
|||
#include "ggml-backend.h" |
|||
#include "ggml-cpu.h" |
|||
#include "ggml.h" |
|||
#include "gguf.h" |
|||
|
|||
#ifdef GGML_USE_CUDA |
|||
#include "ggml-cuda.h" |
|||
#endif |
|||
|
|||
#ifdef GGML_USE_METAL |
|||
#include "ggml-metal.h" |
|||
#endif |
|||
|
|||
#ifdef GGML_USE_CANN |
|||
#include "ggml-cann.h" |
|||
#endif |
|||
|
|||
#ifdef GGML_USE_VULKAN |
|||
#include "ggml-vulkan.h" |
|||
#endif |
|||
|
|||
#include <algorithm> |
|||
#include <cmath> |
|||
#include <cstdarg> |
|||
#include <cstdlib> |
|||
#include <cstring> |
|||
#include <fstream> |
|||
#include <stdexcept> |
|||
#include <vector> |
|||
|
|||
#define REQUIRE(x) \ |
|||
do { \ |
|||
if (!(x)) { \ |
|||
throw std::runtime_error("REQUIRE failed: " #x); \ |
|||
} \ |
|||
} while (0) |
|||
|
|||
#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__) |
|||
|
|||
#if defined(_WIN32) |
|||
#define WIN32_LEAN_AND_MEAN |
|||
#ifndef NOMINMAX |
|||
#define NOMINMAX |
|||
#endif |
|||
#include <windows.h> |
|||
#if __GLIBCXX__ |
|||
#include <cstdio> |
|||
#include <ext/stdio_filebuf.h> |
|||
#include <fcntl.h> |
|||
#endif |
|||
#endif |
|||
|
|||
// A single pre-processed input image: pixel data plus the metadata the
// vision graph needs (tile/channel counts and the aspect-ratio bucket).
struct mllama_image {
    int width;
    int height;

    int num_channels = 3;
    int num_tiles = 4;

    // Aspect-ratio bucket chosen during preprocessing; presumably used to
    // select tile position-embedding rows (fed to the "aspect_ratios"
    // graph input elsewhere) — confirm against the caller.
    int aspect_ratio_id;

    // Normalized pixel values.
    std::vector<float> data;
};
|||
|
|||
// printf-style helper returning a std::string of exactly the formatted
// length. The previous version formatted into a fixed 128-byte buffer,
// REQUIRE-failed on longer output, and returned all 128 bytes (including
// garbage past the terminator); this sizes the buffer with a first
// vsnprintf pass and returns only the n formatted characters.
static std::string format(const char *fmt, ...) {
    va_list args;
    va_start(args, fmt);
    va_list args2;
    va_copy(args2, args); // vsnprintf consumes args; keep a copy for pass two
    const int n = vsnprintf(nullptr, 0, fmt, args);
    va_end(args);
    REQUIRE(n >= 0);
    std::vector<char> b(n + 1);
    const int written = vsnprintf(b.data(), b.size(), fmt, args2);
    va_end(args2);
    REQUIRE(written == n);
    return std::string(b.data(), n);
}
|||
|
|||
//
|
|||
// utilities to get data from a gguf file
|
|||
//
|
|||
|
|||
// Returns the index of key in the GGUF metadata; throws via REQUIRE if the
// key is missing (all keys fetched through this helper are mandatory).
static int get_key_index(const gguf_context *ctx, const char *key) {
    int key_index = gguf_find_key(ctx, key);
    REQUIRE(key_index != -1);
    return key_index;
}
|||
|
|||
static std::vector<uint32_t> get_u32_array(const gguf_context *ctx, const std::string &key) { |
|||
const int i = get_key_index(ctx, key.c_str()); |
|||
const int n = gguf_get_arr_n(ctx, i); |
|||
const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i); |
|||
|
|||
std::vector<uint32_t> s(n); |
|||
for (size_t j = 0; j < s.size(); j++) { |
|||
s[j] = data[j]; |
|||
} |
|||
|
|||
return s; |
|||
} |
|||
|
|||
// Reads a required uint32 metadata value; throws via REQUIRE if key is absent.
static uint32_t get_u32(const gguf_context *ctx, const std::string &key) {
    return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str()));
}
|||
|
|||
// Reads a required float32 metadata value; throws via REQUIRE if key is absent.
static float get_f32(const gguf_context *ctx, const std::string &key) {
    return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str()));
}
|||
|
|||
// Maps a raw file-type value to its ggml type name (for logging).
static std::string get_ftype(int ftype) {
    return ggml_type_name(static_cast<ggml_type>(ftype));
}
|||
|
|||
//
|
|||
// mllama layers
|
|||
//
|
|||
|
|||
// Vision-tower hyperparameters, populated from the GGUF "mllama.vision.*"
// keys in mllama_model_load.
struct mllama_hparams {
    uint32_t image_size;       // input resolution (square)
    uint32_t patch_size;       // conv patch stride/kernel
    uint32_t hidden_size;      // encoder embedding width
    uint32_t n_intermediate;   // feed-forward width
    uint32_t projection_dim;
    uint32_t n_head;
    uint32_t n_layer;          // local encoder depth
    uint32_t n_global_layer;   // global encoder depth
    uint32_t n_tiles;          // max image tiles

    float eps;                 // layernorm epsilon

    // intermediate_layers[i] is true when layer i's hidden state is captured
    // as an intermediate output (see intermediate_layers_indices).
    std::vector<bool> intermediate_layers;
};
|||
|
|||
// Weights of one vision encoder block. Bias pointers and the two gate
// tensors may be nullptr (loaded as optional in mllama_layers_load).
struct mllama_layer {
    // attention (query/key/value/output projections)
    struct ggml_tensor *k_w;
    struct ggml_tensor *k_b;
    struct ggml_tensor *q_w;
    struct ggml_tensor *q_b;
    struct ggml_tensor *v_w;
    struct ggml_tensor *v_b;

    struct ggml_tensor *o_w;
    struct ggml_tensor *o_b;

    // optional per-layer attention output gate
    struct ggml_tensor *attn_gate;

    // layernorm 1 (pre-attention)
    struct ggml_tensor *ln_1_w;
    struct ggml_tensor *ln_1_b;

    // ff (two-layer MLP)
    struct ggml_tensor *ff_i_w;
    struct ggml_tensor *ff_i_b;

    struct ggml_tensor *ff_o_w;
    struct ggml_tensor *ff_o_b;

    // optional per-layer feed-forward output gate
    struct ggml_tensor *ff_gate;

    // layernorm 2 (pre-feed-forward)
    struct ggml_tensor *ln_2_w;
    struct ggml_tensor *ln_2_b;
};
|||
|
|||
// Full vision tower: embeddings, the local and global encoder stacks, the
// surrounding layernorms, and the multimodal projector. Any of the optional
// tensors may be nullptr depending on the checkpoint.
struct mllama_vision_model {
    struct mllama_hparams hparams;

    // embeddings (class/patch/position, each position embedding with an
    // optional learned gate; tile variants are indexed by aspect ratio)
    struct ggml_tensor *class_embedding;
    struct ggml_tensor *patch_embeddings;
    struct ggml_tensor *position_embeddings;
    struct ggml_tensor *position_embeddings_gate;
    struct ggml_tensor *tile_position_embeddings;
    struct ggml_tensor *tile_position_embeddings_gate;
    struct ggml_tensor *pre_tile_position_embeddings;
    struct ggml_tensor *pre_tile_position_embeddings_gate;
    struct ggml_tensor *post_tile_position_embeddings;
    struct ggml_tensor *post_tile_position_embeddings_gate;

    // layernorm before the local encoder
    struct ggml_tensor *pre_ln_w;
    struct ggml_tensor *pre_ln_b;

    std::vector<mllama_layer> layers;        // local encoder
    std::vector<mllama_layer> global_layers; // global encoder

    // layernorm after the local encoder
    struct ggml_tensor *post_ln_w;
    struct ggml_tensor *post_ln_b;

    // multimodal projector (single linear layer, "mm.0")
    struct ggml_tensor *mm_0_w;
    struct ggml_tensor *mm_0_b;
};
|||
|
|||
// Top-level mllama state: the loaded vision model, its GGUF/ggml contexts,
// and the backend plus allocators used for evaluation.
struct mllama_ctx {
    struct mllama_vision_model vision_model;

    uint32_t ftype = 1; // file type (quantization) reported by the GGUF

    struct gguf_context *ctx_gguf; // metadata
    struct ggml_context *ctx_data; // tensor storage

    // scratch metadata for building compute graphs (no_alloc ggml context)
    std::vector<uint8_t> buf_compute_meta;

    // memory buffers to evaluate the model
    ggml_backend_buffer_t params_buffer = nullptr;

    ggml_backend_t backend = nullptr;
    ggml_gallocr_t compute_alloc = nullptr;
};
|||
|
|||
// Builds one pre-norm transformer encoder block into the ggml graph:
// LN1 -> multi-head self-attention (optionally gated) -> residual ->
// LN2 -> GELU feed-forward (optionally gated) -> residual.
// Returns the block's output tensor; used for both local and global layers.
// NOTE(review): il is size_t but is forwarded through format("%d", ...)
// varargs — undefined behavior where size_t is wider than int; confirm and
// cast at the call sites.
static ggml_tensor *mllama_image_build_encoder_layer(
    struct ggml_context *ctx0, const size_t il, const struct mllama_layer &layer, struct ggml_tensor *embeddings,
    const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) {
    struct ggml_tensor *cur = embeddings;

    {
        // layernorm1
        cur = ggml_norm(ctx0, cur, eps);
        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b);
        ggml_set_name(cur, format("%d pre layernorm", il).c_str());
    }

    {
        // self-attention
        struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur);
        if (layer.q_b != nullptr) {
            Q = ggml_add(ctx0, Q, layer.q_b);
        }

        // split heads: [d_head, n_head, seq, batch], heads-major for matmul
        Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size);
        Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
        ggml_set_name(Q, format("%d query", il).c_str());

        struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur);
        if (layer.k_b != nullptr) {
            K = ggml_add(ctx0, K, layer.k_b);
        }

        K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size);
        K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
        ggml_set_name(K, format("%d key", il).c_str());

        struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur);
        if (layer.v_b != nullptr) {
            V = ggml_add(ctx0, V, layer.v_b);
        }

        // V is permuted differently (seq into dim 0) so V @ softmax(KQ) yields
        // the attention output directly.
        V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size);
        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
        ggml_set_name(V, format("%d value", il).c_str());

        // scaled dot-product attention scores
        struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
        KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
        KQ = ggml_soft_max_inplace(ctx0, KQ);
        ggml_set_name(KQ, format("%d KQ", il).c_str());

        struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
        // merge heads back to [hidden_size, seq, batch]
        KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size);
        KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
        KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size);
        ggml_set_name(KQV, format("%d KQV", il).c_str());

        cur = ggml_mul_mat(ctx0, layer.o_w, KQV);
        if (layer.o_b != nullptr) {
            cur = ggml_add(ctx0, cur, layer.o_b);
        }
        ggml_set_name(cur, format("%d self attention", il).c_str());

        // optional learned gate on the attention output
        if (layer.attn_gate != nullptr) {
            cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate);
            ggml_set_name(cur, format("%d self attention gate", il).c_str());
        }
    }

    // residual 1
    cur = ggml_add(ctx0, cur, embeddings);
    ggml_set_name(cur, format("%d residual", il).c_str());

    embeddings = cur;

    {
        // layernorm2
        cur = ggml_norm(ctx0, cur, eps);
        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b);
        ggml_set_name(cur, format("%d post layernorm", il).c_str());
    }

    {
        // feed forward (linear -> GELU -> linear)
        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b);
        cur = ggml_gelu_inplace(ctx0, cur);
        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b);
        ggml_set_name(cur, format("%d feed forward", il).c_str());

        // optional learned gate on the feed-forward output
        if (layer.ff_gate != nullptr) {
            cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate);
            ggml_set_name(cur, format("%d feed forward gate", il).c_str());
        }
    }

    // residual 2
    cur = ggml_add(ctx0, cur, embeddings);
    ggml_set_name(cur, format("%d residual", il).c_str());

    embeddings = cur;

    return embeddings;
}
|||
|
|||
// Builds the full vision-tower compute graph for one image batch
// (batch size must be 1): patch embedding, tile/position embeddings, the
// local encoder (capturing configured intermediate layers), the global
// encoder, concatenation of final + intermediate states, and the multimodal
// projector. Graph inputs created here ("inp_raw", "aspect_ratios",
// "embeddings", "positions") are filled in by the caller before evaluation.
static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) {
    const auto &model = ctx->vision_model;
    const auto &hparams = model.hparams;

    const int image_size = hparams.image_size;
    const int image_size_width = image_size;
    const int image_size_height = image_size;

    const int patch_size = hparams.patch_size;
    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
    // one extra position when a class token is present
    const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
    const int hidden_size = hparams.hidden_size;
    const int n_head = hparams.n_head;
    const int d_head = hidden_size / n_head;

    const int batch_size = imgs->size;
    REQUIRE(batch_size == 1);

    // fall back to defaults when the batch carries no per-image metadata
    int num_tiles = 4;
    int num_channels = 3;
    if (imgs->data != nullptr) {
        num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles;
        num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels;
    }

    // no_alloc context: only graph metadata lives here, data is allocated by
    // the backend allocator at eval time
    struct ggml_init_params params = {
        ctx->buf_compute_meta.size(), // mem_size
        ctx->buf_compute_meta.data(), // mem_buffer
        true, // no_alloc
    };

    struct ggml_context *ctx0 = ggml_init(params);
    struct ggml_cgraph *gf = ggml_new_graph(ctx0);

    // raw pixels: width x height x channels x tiles
    struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles);
    ggml_set_name(inp_raw, "inp_raw");
    ggml_set_input(inp_raw);

    // patchify via conv2d with stride == patch size, then lay out as
    // [hidden_size, num_patches, num_tiles]
    struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles);
    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

    // aspect-ratio bucket ids; index rows of the tile embedding tables
    struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size);
    ggml_set_name(aspect_ratios, "aspect_ratios");
    ggml_set_input(aspect_ratios);

    // gated pre-tile position embedding, added before the class token
    if (model.pre_tile_position_embeddings != nullptr) {
        struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios);
        ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings");

        pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles);
        if (model.pre_tile_position_embeddings_gate != nullptr) {
            pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate);
        }

        inp = ggml_add(ctx0, inp, pre_tile_position_embeddings);
    }

    struct ggml_tensor *embeddings = inp;

    if (model.class_embedding != nullptr) {
        // concat class_embeddings and patch_embeddings
        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles);
        ggml_set_name(embeddings, "embeddings");
        ggml_set_input(embeddings);
        for (int i = 0; i < num_tiles; ++i) {
            // repeat class embeddings for each tile
            embeddings = ggml_acc_inplace(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]);
        }

        // write patch embeddings after the class slot
        embeddings = ggml_acc_inplace(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
    }

    // per-position indices for the position-embedding lookup
    struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
    if (model.position_embeddings_gate != nullptr) {
        position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate);
    }

    embeddings = ggml_add(ctx0, embeddings, position_embd);

    // gated per-tile position embedding spanning all positions
    if (model.tile_position_embeddings != nullptr) {
        struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios);
        ggml_set_name(tile_position_embeddings, "tile_position_embeddings");

        tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles);
        if (model.tile_position_embeddings_gate != nullptr) {
            tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate);
        }

        embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings);
    }

    // pre-layernorm
    if (model.pre_ln_w != nullptr) {
        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w);
        if (model.pre_ln_b != nullptr) {
            embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b);
        }

        ggml_set_name(embeddings, "pre layernorm");
    }

    // NOTE(review): precedence makes this 8 - (ne[1] % 8), which is 8 (not 0)
    // when ne[1] is already a multiple of 8; (8 - ne[1] % 8) % 8 may have been
    // intended — confirm whether a full extra pad block is deliberate.
    const int num_padding_patches = 8 - (embeddings->ne[1] % 8) % 8;

    // pad the sequence, then flatten tiles into one sequence for the encoder
    embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
    embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0);

    std::vector<struct ggml_tensor *> intermediate_embeddings;

    // encoder (local layers); capture inputs of configured intermediate layers
    for (size_t il = 0; il < model.layers.size(); il++) {
        if (hparams.intermediate_layers[il]) {
            intermediate_embeddings.push_back(embeddings);
        }

        embeddings = mllama_image_build_encoder_layer(
            ctx0, il, model.layers[il], embeddings,
            hparams.eps, hidden_size, batch_size, n_head, d_head);
    }

    // post-layernorm
    if (model.post_ln_w != nullptr) {
        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w);
        if (model.post_ln_b != nullptr) {
            embeddings = ggml_add(ctx0, embeddings, model.post_ln_b);
        }

        ggml_set_name(embeddings, "post layernorm");
    }

    // back to per-tile layout for the post-tile embedding
    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);

    if (model.post_tile_position_embeddings != nullptr) {
        struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios);
        ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings");

        post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles);
        if (model.post_tile_position_embeddings_gate != nullptr) {
            post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate);
        }

        embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings);
    }

    // flatten again for the global encoder
    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1);

    // global encoder
    for (size_t il = 0; il < model.global_layers.size(); il++) {
        embeddings = mllama_image_build_encoder_layer(
            ctx0, il, model.global_layers[il], embeddings,
            hparams.eps, hidden_size, batch_size, n_head, d_head);
    }

    // stack captured intermediate states along dim 0; the seed tensor has a
    // zero-length first dimension so each concat appends one layer's slice.
    // NOTE(review): confirm ggml_concat accepts a 0-extent operand on all
    // backends in this tree.
    struct ggml_tensor *stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (num_positions + num_padding_patches) * num_tiles);
    for (size_t i = 0; i < intermediate_embeddings.size(); ++i) {
        stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0);
    }

    stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, num_positions + num_padding_patches, num_tiles, batch_size);
    stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0);

    // strip padding and concatenate final state with intermediate states
    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
    embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
    embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0);

    // mllama projector
    embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b);
    ggml_set_name(embeddings, "multi modal projector");

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    ggml_free(ctx0);

    return gf;
}
|||
|
|||
// Looks up a tensor by name. If optional is false a missing tensor throws
// via REQUIRE; if optional, nullptr is returned and the caller must check.
static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) {
    struct ggml_tensor *cur = ggml_get_tensor(ctx, name);
    REQUIRE(cur != nullptr || optional);
    return cur;
}
|||
|
|||
static std::vector<struct mllama_layer> mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) { |
|||
std::vector<struct mllama_layer> layers(n); |
|||
for (size_t i = 0; i < layers.size(); i++) { |
|||
auto &layer = layers[i]; |
|||
layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false); |
|||
layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false); |
|||
layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false); |
|||
layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false); |
|||
|
|||
layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false); |
|||
layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true); |
|||
layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false); |
|||
layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true); |
|||
layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false); |
|||
layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true); |
|||
layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false); |
|||
layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true); |
|||
|
|||
layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false); |
|||
layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false); |
|||
layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false); |
|||
layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false); |
|||
|
|||
layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true); |
|||
layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true); |
|||
} |
|||
|
|||
return layers; |
|||
} |
|||
|
|||
// read and create ggml_context containing the tensors and their data
|
|||
struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) { |
|||
struct ggml_context *meta = nullptr; |
|||
|
|||
struct gguf_init_params params = { |
|||
true, // no_alloc
|
|||
&meta, // ctx
|
|||
}; |
|||
|
|||
struct gguf_context *ctx = gguf_init_from_file(fname, params); |
|||
REQUIRE(ctx != nullptr); |
|||
|
|||
if (verbosity >= 1) { |
|||
const int n_tensors = gguf_get_n_tensors(ctx); |
|||
const int n_kv = gguf_get_n_kv(ctx); |
|||
const std::string ftype = get_ftype(get_u32(ctx, "general.file_type")); |
|||
const int idx_desc = get_key_index(ctx, "general.description"); |
|||
const std::string description = gguf_get_val_str(ctx, idx_desc); |
|||
const int idx_name = gguf_find_key(ctx, "general.name"); |
|||
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
|
|||
const std::string name = gguf_get_val_str(ctx, idx_name); |
|||
LOG("model name: %s", name.c_str()); |
|||
} |
|||
LOG("description: %s", description.c_str()); |
|||
LOG("GGUF version: %d", gguf_get_version(ctx)); |
|||
LOG("alignment: %zu", gguf_get_alignment(ctx)); |
|||
LOG("n_tensors: %d", n_tensors); |
|||
LOG("n_kv: %d", n_kv); |
|||
LOG("ftype: %s", ftype.c_str()); |
|||
LOG(""); |
|||
} |
|||
const int n_tensors = gguf_get_n_tensors(ctx); |
|||
|
|||
mllama_ctx *new_mllama = new mllama_ctx{}; |
|||
|
|||
ggml_backend_t backend = ggml_backend_init_best(); |
|||
if (backend == nullptr) { |
|||
LOG("%s: failed to initialize backend\n", __func__); |
|||
mllama_free(new_mllama); |
|||
gguf_free(ctx); |
|||
return nullptr; |
|||
} |
|||
LOG("%s: using %s backend\n", __func__, ggml_backend_name(backend)); |
|||
new_mllama->backend = backend; |
|||
|
|||
// load tensors
|
|||
{ |
|||
std::vector<uint8_t> read_buf; |
|||
struct ggml_init_params params = { |
|||
(n_tensors + 1) * ggml_tensor_overhead(), // mem_size
|
|||
nullptr, // mem_buffer
|
|||
true, // no_alloc
|
|||
}; |
|||
|
|||
new_mllama->ctx_data = ggml_init(params); |
|||
if (!new_mllama->ctx_data) { |
|||
LOG("ggml_init() failed"); |
|||
mllama_free(new_mllama); |
|||
gguf_free(ctx); |
|||
return nullptr; |
|||
} |
|||
|
|||
#ifdef _WIN32 |
|||
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); |
|||
if (!wlen) { |
|||
return NULL; |
|||
} |
|||
wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t)); |
|||
wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen); |
|||
if (!wlen) { |
|||
free(wbuf); |
|||
return NULL; |
|||
} |
|||
#if __GLIBCXX__ |
|||
int fd = _wopen(wbuf, _O_RDONLY | _O_BINARY); |
|||
__gnu_cxx::stdio_filebuf<char> buffer(fd, std::ios_base::in); |
|||
std::istream fin(&buffer); |
|||
#else // MSVC
|
|||
// unused in our current build
|
|||
auto fin = std::ifstream(wbuf, std::ios::binary); |
|||
#endif |
|||
free(wbuf); |
|||
#else |
|||
auto fin = std::ifstream(fname, std::ios::binary); |
|||
#endif |
|||
if (!fin) { |
|||
LOG("cannot open model file for loading tensors\n"); |
|||
mllama_free(new_mllama); |
|||
gguf_free(ctx); |
|||
return nullptr; |
|||
} |
|||
|
|||
// add tensors to context
|
|||
for (int i = 0; i < n_tensors; ++i) { |
|||
const char *name = gguf_get_tensor_name(ctx, i); |
|||
struct ggml_tensor *t = ggml_get_tensor(meta, name); |
|||
struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t); |
|||
ggml_set_name(cur, name); |
|||
} |
|||
|
|||
// alloc memory and offload data
|
|||
new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend); |
|||
for (int i = 0; i < n_tensors; ++i) { |
|||
const char *name = gguf_get_tensor_name(ctx, i); |
|||
struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name); |
|||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); |
|||
fin.seekg(offset, std::ios::beg); |
|||
if (!fin) { |
|||
LOG("failed to seek for tensor %s\n", name); |
|||
mllama_free(new_mllama); |
|||
gguf_free(ctx); |
|||
return nullptr; |
|||
} |
|||
int num_bytes = ggml_nbytes(cur); |
|||
if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) { |
|||
// for the CPU and Metal backend, we can read directly into the tensor
|
|||
fin.read(reinterpret_cast<char *>(cur->data), num_bytes); |
|||
} else { |
|||
// read into a temporary buffer first, then copy to device memory
|
|||
read_buf.resize(num_bytes); |
|||
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes); |
|||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); |
|||
} |
|||
} |
|||
|
|||
#if defined(_WIN32) && defined(__GLIBCXX__) |
|||
close(fd); |
|||
#else |
|||
fin.close(); |
|||
#endif |
|||
} |
|||
|
|||
// vision model
|
|||
// load vision model
|
|||
auto &vision_model = new_mllama->vision_model; |
|||
auto &hparams = vision_model.hparams; |
|||
hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length"); |
|||
hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count"); |
|||
hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length"); |
|||
hparams.n_layer = get_u32(ctx, "mllama.vision.block_count"); |
|||
hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count"); |
|||
hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles"); |
|||
hparams.image_size = get_u32(ctx, "mllama.vision.image_size"); |
|||
hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size"); |
|||
hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim"); |
|||
hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon"); |
|||
|
|||
std::vector<uint32_t> intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices"); |
|||
hparams.intermediate_layers.resize(hparams.n_layer); |
|||
for (size_t i = 0; i < intermediate_layers_indices.size(); i++) { |
|||
hparams.intermediate_layers[intermediate_layers_indices[i]] = true; |
|||
} |
|||
|
|||
if (verbosity >= 2) { |
|||
LOG(""); |
|||
LOG("vision model hparams"); |
|||
LOG("image_size %d", hparams.image_size); |
|||
LOG("patch_size %d", hparams.patch_size); |
|||
LOG("v_hidden_size %d", hparams.hidden_size); |
|||
LOG("v_n_intermediate %d", hparams.n_intermediate); |
|||
LOG("v_projection_dim %d", hparams.projection_dim); |
|||
LOG("v_n_head %d", hparams.n_head); |
|||
LOG("v_n_layer %d", hparams.n_layer); |
|||
LOG("v_n_global_layer %d", hparams.n_global_layer); |
|||
LOG("v_eps %f", hparams.eps); |
|||
} |
|||
|
|||
vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true); |
|||
vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true); |
|||
|
|||
vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true); |
|||
vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true); |
|||
|
|||
vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true); |
|||
vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true); |
|||
vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true); |
|||
vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true); |
|||
|
|||
vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true); |
|||
vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true); |
|||
|
|||
vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true); |
|||
vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true); |
|||
|
|||
vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true); |
|||
vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true); |
|||
|
|||
vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false); |
|||
vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false); |
|||
|
|||
vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer); |
|||
vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer); |
|||
|
|||
ggml_free(meta); |
|||
|
|||
new_mllama->ctx_gguf = ctx; |
|||
|
|||
{ |
|||
// measure mem requirement and allocate
|
|||
new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); |
|||
new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend)); |
|||
struct mllama_image_batch batch; |
|||
batch.size = 1; |
|||
ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch); |
|||
ggml_gallocr_reserve(new_mllama->compute_alloc, gf); |
|||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0); |
|||
LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0); |
|||
} |
|||
|
|||
return new_mllama; |
|||
} |
|||
|
|||
struct mllama_image *mllama_image_init() { |
|||
return new mllama_image(); |
|||
} |
|||
|
|||
// Release an image previously created with mllama_image_init().
// Passing a null pointer is a no-op (delete on nullptr is well-defined).
void mllama_image_free(struct mllama_image *img) {
    delete img;
}
|||
void mllama_image_batch_free(struct mllama_image_batch *batch) { |
|||
if (batch->size > 0) { |
|||
delete[] batch->data; |
|||
batch->size = 0; |
|||
} |
|||
} |
|||
|
|||
// Fill an mllama_image from a raw pre-processed pixel buffer.
//
// data            - source buffer holding the packed pixel bytes
// n               - number of bytes to copy from data
// width / height  - image dimensions in pixels
// num_channels    - channels per pixel
// num_tiles       - number of tiles the image was split into
// aspect_ratio_id - index of the tile arrangement chosen during preprocessing
// img             - destination image; its buffer is resized to n bytes
//
// Returns true on success, false when the destination is missing, n is
// negative, or a non-empty copy is requested from a null source (previously
// these cases invoked undefined behavior in resize()/memcpy()).
bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) {
    if (img == nullptr || n < 0 || (n > 0 && data == nullptr)) {
        return false;
    }

    img->width = width;
    img->height = height;
    img->num_channels = num_channels;
    img->num_tiles = num_tiles;
    img->aspect_ratio_id = aspect_ratio_id;
    img->data.resize(n);

    // skip the copy entirely for empty input; memcpy with a null source is UB
    // even when the length is zero
    if (n > 0) {
        memcpy(img->data.data(), data, n);
    }
    return true;
}
|||
|
|||
// Clamp x into the inclusive range [lower, upper], computed as
// max(lower, min(x, upper)) so that lower wins when lower > upper.
// NOTE(review): the name "mllama" looks like a leftover from renaming a
// clamp helper; the behavior is a plain integer clamp.
inline int mllama(int x, int lower, int upper) {
    int clamped = x;
    if (clamped > upper) {
        clamped = upper;
    }
    if (clamped < lower) {
        clamped = lower;
    }
    return clamped;
}
|||
|
|||
// Destroy a context created by mllama_model_load(): releases the tensor
// context, the gguf metadata, the backend weight buffer, the backend itself,
// and the graph allocator, then frees the context object.
//
// Accepts a null pointer as a no-op, matching the convention of free();
// previously a null argument would have dereferenced ctx and crashed.
void mllama_free(mllama_ctx *ctx) {
    if (ctx == nullptr) {
        return;
    }

    ggml_free(ctx->ctx_data);
    gguf_free(ctx->ctx_gguf);

    ggml_backend_buffer_free(ctx->params_buffer);
    ggml_backend_free(ctx->backend);
    ggml_gallocr_free(ctx->compute_alloc);
    delete ctx;
}
|||
|
|||
bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) { |
|||
mllama_image_batch imgs{}; |
|||
imgs.size = 1; |
|||
imgs.data = img; |
|||
return mllama_image_batch_encode(ctx, n_threads, &imgs, vec); |
|||
} |
|||
|
|||
// Run the vision encoder over a batch of pre-processed images and write the
// resulting embeddings into the caller-provided buffer vec (which must hold
// at least mllama_n_embd_bytes(ctx) bytes — TODO confirm against callers).
// Only single-image batches are supported; REQUIRE aborts otherwise.
bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) {
    int batch_size = imgs->size;
    REQUIRE(batch_size == 1);

    // build the inference graph and allocate its intermediate tensors
    ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs);
    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);

    // set inputs
    const auto &model = ctx->vision_model;
    const auto &hparams = model.hparams;

    const int image_size = hparams.image_size;
    int image_size_width = image_size;
    int image_size_height = image_size;

    const int patch_size = hparams.patch_size;
    // patches on a side, squared; plus one position for the class token when
    // the model has a class embedding
    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);

    // raw pixel input: copied straight from the first (only) image's buffer
    {
        struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
        ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw));
    }

    // zero-initialize the "embeddings" input tensor if the graph has one
    // NOTE(review): malloc result is not checked; an OOM here would crash in
    // memset rather than fail gracefully
    {
        struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings");
        if (embeddings != nullptr) {
            void *zeros = malloc(ggml_nbytes(embeddings));
            memset(zeros, 0, ggml_nbytes(embeddings));
            ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings));
            free(zeros);
        }
    }

    // position ids 0..num_positions-1
    // NOTE(review): only the first num_positions ints are written, but the
    // full ggml_nbytes(positions) is uploaded — confirm the tensor is exactly
    // num_positions long, otherwise trailing bytes are uninitialized
    {
        struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions");
        if (positions != nullptr) {
            int *positions_data = (int *)malloc(ggml_nbytes(positions));
            for (int i = 0; i < num_positions; i++) {
                positions_data[i] = i;
            }
            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
            free(positions_data);
        }
    }

    // aspect-ratio id selecting the tile-arrangement embedding
    {
        struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios");
        if (aspect_ratios != nullptr) {
            int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios));
            aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id;
            ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios));
            free(aspect_ratios_data);
        }
    }

    // the thread count only applies to the CPU backend
    if (ggml_backend_is_cpu(ctx->backend)) {
        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
    }

    ggml_backend_graph_compute(ctx->backend, gf);

    // the last node is the embedding tensor
    struct ggml_tensor *embeddings = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);

    // copy the embeddings to the location passed by the user
    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

    return true;
}
|||
|
|||
int32_t mllama_image_size(const struct mllama_ctx *ctx) { |
|||
return ctx->vision_model.hparams.image_size; |
|||
} |
|||
|
|||
int32_t mllama_patch_size(const struct mllama_ctx *ctx) { |
|||
return ctx->vision_model.hparams.patch_size; |
|||
} |
|||
|
|||
int32_t mllama_hidden_size(const struct mllama_ctx *ctx) { |
|||
return ctx->vision_model.hparams.hidden_size; |
|||
} |
|||
|
|||
int mllama_n_patches(const struct mllama_ctx *ctx) { |
|||
const auto &hparams = ctx->vision_model.hparams; |
|||
return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size); |
|||
} |
|||
|
|||
int mllama_n_positions(const struct mllama_ctx *ctx) { |
|||
return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 0 : 1); |
|||
} |
|||
|
|||
int mllama_n_tiles(const struct mllama_ctx *ctx) { |
|||
return ctx->vision_model.hparams.n_tiles; |
|||
} |
|||
|
|||
int mllama_n_embd(const struct mllama_ctx *ctx) { |
|||
return ctx->vision_model.hparams.projection_dim; |
|||
} |
|||
|
|||
size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) { |
|||
return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float); |
|||
} |
|||
@ -1,61 +0,0 @@ |
|||
#ifndef MLLAMA_H |
|||
#define MLLAMA_H |
|||
|
|||
#include <stddef.h> |
|||
#include <stdint.h> |
|||
|
|||
#ifdef LLAMA_SHARED |
|||
#if defined(_WIN32) && !defined(__MINGW32__) |
|||
#ifdef LLAMA_BUILD |
|||
#define MLLAMA_API __declspec(dllexport) |
|||
#else |
|||
#define MLLAMA_API __declspec(dllimport) |
|||
#endif |
|||
#else |
|||
#define MLLAMA_API __attribute__((visibility("default"))) |
|||
#endif |
|||
#else |
|||
#define MLLAMA_API |
|||
#endif |
|||
|
|||
#ifdef __cplusplus |
|||
extern "C" { |
|||
#endif |
|||
|
|||
struct mllama_ctx; |
|||
|
|||
struct mllama_image_batch { |
|||
struct mllama_image *data; |
|||
size_t size; |
|||
}; |
|||
|
|||
MLLAMA_API struct mllama_ctx *mllama_model_load(const char *fname, int verbosity); |
|||
MLLAMA_API struct mllama_ctx *mllama_model_load_cpu(const char *fname, int verbosity); |
|||
|
|||
MLLAMA_API void mllama_free(struct mllama_ctx *ctx); |
|||
|
|||
MLLAMA_API int32_t mllama_image_size(const struct mllama_ctx *ctx); |
|||
MLLAMA_API int32_t mllama_patch_size(const struct mllama_ctx *ctx); |
|||
MLLAMA_API int32_t mllama_hidden_size(const struct mllama_ctx *ctx); |
|||
|
|||
MLLAMA_API int mllama_n_patches(const struct mllama_ctx *ctx); |
|||
MLLAMA_API int mllama_n_positions(const struct mllama_ctx *ctx); |
|||
MLLAMA_API int mllama_n_tiles(const struct mllama_ctx *ctx); |
|||
MLLAMA_API int mllama_n_embd(const struct mllama_ctx *ctx); |
|||
MLLAMA_API size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx); |
|||
|
|||
MLLAMA_API struct mllama_image *mllama_image_init(); |
|||
|
|||
MLLAMA_API void mllama_image_free(struct mllama_image *img); |
|||
MLLAMA_API void mllama_image_batch_free(struct mllama_image_batch *batch); |
|||
|
|||
MLLAMA_API bool mllama_image_load_from_data(const void *data, const int n, const int nx, const int ny, const int nc, const int nt, const int aspect_ratio_id, struct mllama_image *img); |
|||
|
|||
MLLAMA_API bool mllama_image_encode(struct mllama_ctx *ctx, int n_threads, struct mllama_image *img, float *vec); |
|||
MLLAMA_API bool mllama_image_batch_encode(struct mllama_ctx *ctx, int n_threads, const struct mllama_image_batch *imgs, float *vec); |
|||
|
|||
#ifdef __cplusplus |
|||
} |
|||
#endif |
|||
|
|||
#endif // MLLAMA_H
|
|||
File diff suppressed because it is too large
@ -1,419 +0,0 @@ |
|||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
|||
From: jmorganca <jmorganca@gmail.com> |
|||
Date: Sun, 13 Apr 2025 22:10:06 -0400 |
|||
Subject: [PATCH] add unpad operator |
|||
|
|||
adds the unpad operator to GGML |
|||
---
|
|||
ggml/include/ggml.h | 10 +++++ |
|||
ggml/src/ggml-cpu/ggml-cpu.c | 5 +++ |
|||
ggml/src/ggml-cpu/ops.cpp | 55 ++++++++++++++++++++++++++++ |
|||
ggml/src/ggml-cpu/ops.h | 1 + |
|||
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++ |
|||
ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++++++ |
|||
ggml/src/ggml-cuda/pad.cuh | 1 + |
|||
ggml/src/ggml-metal/ggml-metal.m | 33 +++++++++++++++++ |
|||
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++++ |
|||
ggml/src/ggml.c | 25 ++++++++++++- |
|||
10 files changed, 223 insertions(+), 2 deletions(-) |
|||
|
|||
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
|||
index e91dedf1..8dc107ba 100644
|
|||
--- a/ggml/include/ggml.h
|
|||
+++ b/ggml/include/ggml.h
|
|||
@@ -489,6 +489,7 @@ extern "C" {
|
|||
GGML_OP_UPSCALE, // nearest interpolate |
|||
GGML_OP_PAD, |
|||
GGML_OP_PAD_REFLECT_1D, |
|||
+ GGML_OP_UNPAD,
|
|||
GGML_OP_ARANGE, |
|||
GGML_OP_TIMESTEP_EMBEDDING, |
|||
GGML_OP_ARGSORT, |
|||
@@ -1781,6 +1782,15 @@ extern "C" {
|
|||
int p0, |
|||
int p1); |
|||
|
|||
+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
|
|||
+ GGML_API struct ggml_tensor * ggml_unpad(
|
|||
+ struct ggml_context * ctx,
|
|||
+ struct ggml_tensor * a,
|
|||
+ int p0,
|
|||
+ int p1,
|
|||
+ int p2,
|
|||
+ int p3);
|
|||
+
|
|||
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 |
|||
// timesteps: [N,] |
|||
// return: [N, dim] |
|||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
|||
index a30e67f2..835e6495 100644
|
|||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
|||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
|||
@@ -1951,6 +1951,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|||
{ |
|||
ggml_compute_forward_pad_reflect_1d(params, tensor); |
|||
} break; |
|||
+ case GGML_OP_UNPAD:
|
|||
+ {
|
|||
+ ggml_compute_forward_unpad(params, tensor);
|
|||
+ } break;
|
|||
case GGML_OP_ARANGE: |
|||
{ |
|||
ggml_compute_forward_arange(params, tensor); |
|||
@@ -2274,6 +2278,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|||
case GGML_OP_UPSCALE: |
|||
case GGML_OP_PAD: |
|||
case GGML_OP_PAD_REFLECT_1D: |
|||
+ case GGML_OP_UNPAD:
|
|||
case GGML_OP_ARANGE: |
|||
case GGML_OP_TIMESTEP_EMBEDDING: |
|||
case GGML_OP_ARGSORT: |
|||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
|||
index 955fec59..1868a10c 100644
|
|||
--- a/ggml/src/ggml-cpu/ops.cpp
|
|||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
|||
@@ -6690,6 +6690,61 @@ void ggml_compute_forward_pad_reflect_1d(
|
|||
} |
|||
} |
|||
|
|||
+// ggml_compute_forward_unpad
|
|||
+
|
|||
+static void ggml_compute_forward_unpad_f32(
|
|||
+ const struct ggml_compute_params *params,
|
|||
+ struct ggml_tensor *dst) {
|
|||
+
|
|||
+ const struct ggml_tensor * src0 = dst->src[0];
|
|||
+
|
|||
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
|
|||
+ GGML_ASSERT( dst->nb[0] == sizeof(float));
|
|||
+
|
|||
+ const int ith = params->ith;
|
|||
+ const int nth = params->nth;
|
|||
+
|
|||
+ GGML_TENSOR_UNARY_OP_LOCALS
|
|||
+
|
|||
+ float * dst_ptr = (float *) dst->data;
|
|||
+
|
|||
+ // TODO: optimize
|
|||
+
|
|||
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
|||
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
|
|||
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
|||
+ for (int64_t i3 = 0; i3 < ne3; ++i3) {
|
|||
+ const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
|
|||
+
|
|||
+ const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|||
+
|
|||
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
|||
+ dst_ptr[dst_idx] = *src_ptr;
|
|||
+ }
|
|||
+ }
|
|||
+ }
|
|||
+ }
|
|||
+ }
|
|||
+}
|
|||
+
|
|||
+void ggml_compute_forward_unpad(
|
|||
+ const struct ggml_compute_params * params,
|
|||
+ struct ggml_tensor * dst) {
|
|||
+
|
|||
+ const struct ggml_tensor * src0 = dst->src[0];
|
|||
+
|
|||
+ switch (src0->type) {
|
|||
+ case GGML_TYPE_F32:
|
|||
+ {
|
|||
+ ggml_compute_forward_unpad_f32(params, dst);
|
|||
+ } break;
|
|||
+ default:
|
|||
+ {
|
|||
+ GGML_ABORT("fatal error");
|
|||
+ }
|
|||
+ }
|
|||
+}
|
|||
+
|
|||
// ggml_compute_forward_arange |
|||
|
|||
static void ggml_compute_forward_arange_f32( |
|||
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
|
|||
index dc081b9e..a7125555 100644
|
|||
--- a/ggml/src/ggml-cpu/ops.h
|
|||
+++ b/ggml/src/ggml-cpu/ops.h
|
|||
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
|
|||
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst); |
|||
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst); |
|||
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); |
|||
+void ggml_compute_forward_unpad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
|||
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst); |
|||
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst); |
|||
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst); |
|||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
|||
index cb0d8528..6fe86674 100644
|
|||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
|||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
|||
@@ -2238,6 +2238,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||
case GGML_OP_PAD: |
|||
ggml_cuda_op_pad(ctx, dst); |
|||
break; |
|||
+ case GGML_OP_UNPAD:
|
|||
+ ggml_cuda_op_unpad(ctx, dst);
|
|||
+ break;
|
|||
case GGML_OP_ARANGE: |
|||
ggml_cuda_op_arange(ctx, dst); |
|||
break; |
|||
@@ -3212,6 +3215,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
case GGML_OP_UPSCALE: |
|||
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; |
|||
case GGML_OP_PAD: |
|||
+ case GGML_OP_UNPAD:
|
|||
case GGML_OP_ARANGE: |
|||
case GGML_OP_TIMESTEP_EMBEDDING: |
|||
case GGML_OP_LEAKY_RELU: |
|||
diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
|
|||
index 77432b04..7d45a7e1 100644
|
|||
--- a/ggml/src/ggml-cuda/pad.cu
|
|||
+++ b/ggml/src/ggml-cuda/pad.cu
|
|||
@@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], |
|||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); |
|||
} |
|||
+
|
|||
+static __global__ void unpad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
|
|||
+ // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
|
|||
+ // blockIdx.y: idx of ne1
|
|||
+ // blockIDx.x: idx of ne0 / BLOCK_SIZE
|
|||
+ int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
|||
+ if (nidx >= ne0) {
|
|||
+ return;
|
|||
+ }
|
|||
+
|
|||
+ // operation
|
|||
+ int offset_dst =
|
|||
+ nidx +
|
|||
+ blockIdx.y * ne0 +
|
|||
+ blockIdx.z * ne0 * gridDim.y;
|
|||
+ if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
|
|||
+ int offset_src =
|
|||
+ nidx +
|
|||
+ blockIdx.y * ne00 +
|
|||
+ blockIdx.z * ne00 * ne01;
|
|||
+ dst[offset_dst] = x[offset_src];
|
|||
+ }
|
|||
+}
|
|||
+
|
|||
+static void unpad_f32_cuda(const float * x, float * dst,
|
|||
+ const int ne00, const int ne01, const int ne02, const int ne03,
|
|||
+ const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
|
|||
+ int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
|
|||
+ dim3 gridDim(num_blocks, ne1, ne2*ne3);
|
|||
+ unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
|
|||
+}
|
|||
+
|
|||
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||
+ const ggml_tensor * src0 = dst->src[0];
|
|||
+ const float * src0_d = (const float *)src0->data;
|
|||
+ float * dst_d = (float *)dst->data;
|
|||
+ cudaStream_t stream = ctx.stream();
|
|||
+
|
|||
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|||
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|||
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
|||
+
|
|||
+ unpad_f32_cuda(src0_d, dst_d,
|
|||
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
|||
+ dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
|
|||
+}
|
|||
\ No newline at end of file |
|||
diff --git a/ggml/src/ggml-cuda/pad.cuh b/ggml/src/ggml-cuda/pad.cuh
|
|||
index 8fd386b0..e2ededc3 100644
|
|||
--- a/ggml/src/ggml-cuda/pad.cuh
|
|||
+++ b/ggml/src/ggml-cuda/pad.cuh
|
|||
@@ -3,3 +3,4 @@
|
|||
#define CUDA_PAD_BLOCK_SIZE 256 |
|||
|
|||
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); |
|||
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
|||
index 1b56f858..7641247e 100644
|
|||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
|||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
|||
@@ -347,6 +347,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
|||
GGML_METAL_KERNEL_TYPE_UPSCALE_F32, |
|||
GGML_METAL_KERNEL_TYPE_PAD_F32, |
|||
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, |
|||
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
|
|||
GGML_METAL_KERNEL_TYPE_ARANGE_F32, |
|||
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, |
|||
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, |
|||
@@ -1294,6 +1295,7 @@ @implementation GGMLMetalClass
|
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); |
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); |
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); |
|||
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
|
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); |
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); |
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); |
|||
@@ -1655,6 +1657,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
|||
case GGML_OP_POOL_2D: |
|||
case GGML_OP_PAD: |
|||
case GGML_OP_PAD_REFLECT_1D: |
|||
+ case GGML_OP_UNPAD:
|
|||
case GGML_OP_TIMESTEP_EMBEDDING: |
|||
case GGML_OP_ARGSORT: |
|||
case GGML_OP_LEAKY_RELU: |
|||
@@ -4184,6 +4187,36 @@ static bool ggml_metal_encode_node(
|
|||
|
|||
const int nth = MIN(1024, ne0); |
|||
|
|||
+ [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
|||
+ } break;
|
|||
+ case GGML_OP_UNPAD:
|
|||
+ {
|
|||
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|||
+
|
|||
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UNPAD_F32].pipeline;
|
|||
+
|
|||
+ [encoder setComputePipelineState:pipeline];
|
|||
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
|||
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
|||
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
|||
+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
|||
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
|||
+ [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
|||
+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
|||
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
|||
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
|||
+ [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
|||
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
|||
+ [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
|||
+ [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
|||
+ [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
|||
+ [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
|||
+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
|||
+ [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
|||
+ [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
|||
+
|
|||
+ const int nth = MIN(1024, ne0);
|
|||
+
|
|||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; |
|||
} break; |
|||
case GGML_OP_ARANGE: |
|||
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
|||
index 9cfddf45..080a943b 100644
|
|||
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
|||
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
|||
@@ -3121,6 +3121,51 @@ kernel void kernel_pad_reflect_1d_f32(
|
|||
} |
|||
} |
|||
|
|||
+kernel void kernel_unpad_f32(
|
|||
+ device const char * src0,
|
|||
+ device char * dst,
|
|||
+ constant int64_t & ne00,
|
|||
+ constant int64_t & ne01,
|
|||
+ constant int64_t & ne02,
|
|||
+ constant int64_t & ne03,
|
|||
+ constant uint64_t & nb00,
|
|||
+ constant uint64_t & nb01,
|
|||
+ constant uint64_t & nb02,
|
|||
+ constant uint64_t & nb03,
|
|||
+ constant int64_t & ne0,
|
|||
+ constant int64_t & ne1,
|
|||
+ constant int64_t & ne2,
|
|||
+ constant int64_t & ne3,
|
|||
+ constant uint64_t & nb0,
|
|||
+ constant uint64_t & nb1,
|
|||
+ constant uint64_t & nb2,
|
|||
+ constant uint64_t & nb3,
|
|||
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
|||
+ uint3 tpitg[[thread_position_in_threadgroup]],
|
|||
+ uint3 ntg[[threads_per_threadgroup]]) {
|
|||
+
|
|||
+ const int64_t i3 = tgpig.z;
|
|||
+ const int64_t i2 = tgpig.y;
|
|||
+ const int64_t i1 = tgpig.x;
|
|||
+
|
|||
+ const int64_t i03 = i3;
|
|||
+ const int64_t i02 = i2;
|
|||
+ const int64_t i01 = i1;
|
|||
+
|
|||
+ device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
|||
+ device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
|||
+
|
|||
+ if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
|||
+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
|||
+ if (i0 < ne00) {
|
|||
+ dst_ptr[i0] = src0_ptr[i0];
|
|||
+ }
|
|||
+ }
|
|||
+
|
|||
+ return;
|
|||
+ }
|
|||
+}
|
|||
+
|
|||
kernel void kernel_arange_f32( |
|||
device char * dst, |
|||
constant ggml_metal_kargs_arange & args, |
|||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
|||
index 8a654624..6b034d35 100644
|
|||
--- a/ggml/src/ggml.c
|
|||
+++ b/ggml/src/ggml.c
|
|||
@@ -923,6 +923,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|||
"UPSCALE", |
|||
"PAD", |
|||
"PAD_REFLECT_1D", |
|||
+ "UNPAD",
|
|||
"ARANGE", |
|||
"TIMESTEP_EMBEDDING", |
|||
"ARGSORT", |
|||
@@ -953,7 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|||
"OPT_STEP_ADAMW", |
|||
}; |
|||
|
|||
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
|||
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
|
|||
|
|||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { |
|||
"none", |
|||
@@ -1018,6 +1019,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
"upscale(x)", |
|||
"pad(x)", |
|||
"pad_reflect_1d(x)", |
|||
+ "unpad(x)",
|
|||
"arange(start, stop, step)", |
|||
"timestep_embedding(timesteps, dim, max_period)", |
|||
"argsort(x)", |
|||
@@ -1048,7 +1050,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
"adamw(x)", |
|||
}; |
|||
|
|||
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
|
|||
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
|
|||
|
|||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); |
|||
|
|||
@@ -4274,6 +4276,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
|
|||
return result; |
|||
} |
|||
|
|||
+// ggml_unpad
|
|||
+
|
|||
+struct ggml_tensor * ggml_unpad(
|
|||
+ struct ggml_context * ctx,
|
|||
+ struct ggml_tensor * a,
|
|||
+ int p0, int p1, int p2, int p3) {
|
|||
+
|
|||
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
|
|||
+ a->ne[0] - p0,
|
|||
+ a->ne[1] - p1,
|
|||
+ a->ne[2] - p2,
|
|||
+ a->ne[3] - p3);
|
|||
+
|
|||
+ result->op = GGML_OP_UNPAD;
|
|||
+ result->src[0] = a;
|
|||
+
|
|||
+ return result;
|
|||
+}
|
|||
+
|
|||
// ggml_arange |
|||
|
|||
struct ggml_tensor * ggml_arange( |
|||
@ -1,201 +0,0 @@ |
|||
package mllama |
|||
|
|||
import ( |
|||
"fmt" |
|||
"image" |
|||
_ "image/jpeg" |
|||
_ "image/png" |
|||
"io" |
|||
"math" |
|||
"slices" |
|||
|
|||
"golang.org/x/image/draw" |
|||
|
|||
"github.com/ollama/ollama/model/imageproc" |
|||
) |
|||
|
|||
func getSupportedAspectRatios(maxTiles int) []image.Point { |
|||
ratios := []image.Point{} |
|||
|
|||
for w := range maxTiles { |
|||
for h := range maxTiles { |
|||
if (w+1)*(h+1) <= maxTiles { |
|||
ratios = append(ratios, image.Point{w + 1, h + 1}) |
|||
} |
|||
} |
|||
} |
|||
|
|||
return ratios |
|||
} |
|||
|
|||
// clip bounds a to the inclusive range [a_min, a_max]; the lower bound is
// checked first, so it wins if a_min > a_max.
func clip(a, a_min, a_max int) int {
	switch {
	case a < a_min:
		return a_min
	case a > a_max:
		return a_max
	default:
		return a
	}
}
|||
|
|||
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { |
|||
possibleTileArrangements := getSupportedAspectRatios(maxImageTiles) |
|||
possibleCanvasSizes := []image.Point{} |
|||
for _, pta := range possibleTileArrangements { |
|||
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) |
|||
} |
|||
|
|||
scales := []float64{} |
|||
|
|||
for _, pcs := range possibleCanvasSizes { |
|||
scaleHeight := float64(pcs.Y) / float64(imageSize.Y) |
|||
scaleWidth := float64(pcs.X) / float64(imageSize.X) |
|||
|
|||
if scaleWidth > scaleHeight { |
|||
scales = append(scales, scaleHeight) |
|||
} else { |
|||
scales = append(scales, scaleWidth) |
|||
} |
|||
} |
|||
|
|||
var minUpscale float64 |
|||
var maxDownscale float64 |
|||
var upscale bool |
|||
|
|||
for _, s := range scales { |
|||
if s > 1.0 { |
|||
upscale = true |
|||
if minUpscale == 0 { |
|||
minUpscale = s |
|||
} else { |
|||
minUpscale = math.Min(minUpscale, s) |
|||
} |
|||
} else { |
|||
maxDownscale = math.Max(maxDownscale, s) |
|||
} |
|||
} |
|||
|
|||
selectedScale := maxDownscale |
|||
if upscale { |
|||
selectedScale = minUpscale |
|||
} |
|||
|
|||
var selectedCanvas image.Point |
|||
for n, pcs := range possibleCanvasSizes { |
|||
if scales[n] == selectedScale { |
|||
// choose the smallest possible canvas
|
|||
if selectedCanvas.X == 0 && selectedCanvas.Y == 0 { |
|||
selectedCanvas = pcs |
|||
} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y { |
|||
selectedCanvas = pcs |
|||
} |
|||
} |
|||
} |
|||
return selectedCanvas |
|||
} |
|||
|
|||
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point { |
|||
targetWidth := clip(imageSize.X, tileSize, canvasSize.X) |
|||
targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y) |
|||
|
|||
scaleWidth := float64(targetWidth) / float64(imageSize.X) |
|||
scaleHeight := float64(targetHeight) / float64(imageSize.Y) |
|||
|
|||
var w, h int |
|||
|
|||
if scaleWidth < scaleHeight { |
|||
w = targetWidth |
|||
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight) |
|||
} else { |
|||
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth) |
|||
h = targetHeight |
|||
} |
|||
|
|||
return image.Point{w, h} |
|||
} |
|||
|
|||
func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) { |
|||
if format == "png" { |
|||
img = imageproc.Composite(img) |
|||
} |
|||
|
|||
b := img.Bounds() |
|||
tileSize := outputSize.Y |
|||
|
|||
canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize) |
|||
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} |
|||
newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize) |
|||
|
|||
return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio |
|||
} |
|||
|
|||
func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image { |
|||
paddedSize := image.Point{ |
|||
X: outputSize.X * aspectRatio.X, |
|||
Y: outputSize.Y * aspectRatio.Y, |
|||
} |
|||
|
|||
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) |
|||
draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over) |
|||
|
|||
return dst |
|||
} |
|||
|
|||
func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { |
|||
b := img.Bounds() |
|||
width := b.Max.X - b.Min.X |
|||
height := b.Max.Y - b.Min.Y |
|||
tileHeight := height / numTilesSize.Y |
|||
tileWidth := width / numTilesSize.X |
|||
|
|||
images := []image.Image{} |
|||
|
|||
for h := range numTilesSize.Y { |
|||
for w := range numTilesSize.X { |
|||
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1)) |
|||
images = append(images, img.(interface { |
|||
SubImage(image.Rectangle) image.Image |
|||
}).SubImage(rect)) |
|||
} |
|||
} |
|||
|
|||
return images |
|||
} |
|||
|
|||
func packImages(img image.Image, aspectRatio image.Point) []float32 { |
|||
subImages := splitToTiles(img, aspectRatio) |
|||
|
|||
var pixelVals []float32 |
|||
|
|||
rescale := true |
|||
channelFirst := true |
|||
|
|||
for _, subImg := range subImages { |
|||
vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst) |
|||
pixelVals = append(pixelVals, vals...) |
|||
} |
|||
|
|||
return pixelVals |
|||
} |
|||
|
|||
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { |
|||
outputSize := image.Point{560, 560} |
|||
maxTiles := 4 |
|||
|
|||
img, format, err := image.Decode(imageData) |
|||
if err != nil { |
|||
return nil, nil, fmt.Errorf("failed to decode image: %w", err) |
|||
} |
|||
|
|||
newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles) |
|||
newImage = padImage(newImage, outputSize, aspectRatio) |
|||
|
|||
data := packImages(newImage, aspectRatio) |
|||
aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1 |
|||
|
|||
opts := map[string]any{ |
|||
"aspectRatioIndex": aspectRatioIndex, |
|||
} |
|||
|
|||
return data, opts, nil |
|||
} |
|||
@ -1,420 +0,0 @@ |
|||
package mllama |
|||
|
|||
import ( |
|||
"bytes" |
|||
"image" |
|||
"image/png" |
|||
"testing" |
|||
|
|||
"github.com/google/go-cmp/cmp" |
|||
) |
|||
|
|||
// TestAspectRatios checks that getSupportedAspectRatios enumerates every
// tile grid layout (width x height, in tiles) available for a given maximum
// tile budget.
func TestAspectRatios(t *testing.T) {
	type aspectCase struct {
		MaxTiles int
		Expected []image.Point
	}

	cases := []aspectCase{
		{
			MaxTiles: 1,
			Expected: []image.Point{{1, 1}},
		},
		{
			MaxTiles: 2,
			Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}},
		},
		{
			MaxTiles: 3,
			Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}},
		},
		{
			MaxTiles: 4,
			Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}},
		},
	}

	for _, c := range cases {
		actual := getSupportedAspectRatios(c.MaxTiles)

		// cmp.Diff(got, want) renders differences as (-got +want).
		if diff := cmp.Diff(actual, c.Expected); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}
|||
|
|||
// TestGetImageSizeFitToCanvas verifies aspect-ratio-preserving fitting of an
// image into a canvas, including upscaling to the tile size and downscaling
// to the canvas bounds.
func TestGetImageSizeFitToCanvas(t *testing.T) {
	type imageSizeCase struct {
		ImageRect  image.Point
		CanvasRect image.Point
		TileSize   int
		Expected   image.Point
	}

	cases := []imageSizeCase{
		{
			// Already fits: size unchanged.
			ImageRect:  image.Point{400, 400},
			CanvasRect: image.Point{640, 480},
			TileSize:   200,
			Expected:   image.Point{400, 400},
		},
		{
			// Larger than canvas: scaled down to fit.
			ImageRect:  image.Point{1024, 768},
			CanvasRect: image.Point{640, 480},
			TileSize:   200,
			Expected:   image.Point{640, 480},
		},
		{
			// Smaller than tile size: scaled up to one tile.
			ImageRect:  image.Point{500, 500},
			CanvasRect: image.Point{1000, 1000},
			TileSize:   750,
			Expected:   image.Point{750, 750},
		},
		{
			ImageRect:  image.Point{500, 1000},
			CanvasRect: image.Point{2000, 2000},
			TileSize:   2000,
			Expected:   image.Point{1000, 2000},
		},
		{
			// Non-matching aspect ratio: limited by the tighter axis.
			ImageRect:  image.Point{4000, 3000},
			CanvasRect: image.Point{2000, 1000},
			TileSize:   1000,
			Expected:   image.Point{1333, 1000},
		},
		{
			ImageRect:  image.Point{667, 1000},
			CanvasRect: image.Point{1000, 1000},
			TileSize:   560,
			Expected:   image.Point{667, 1000},
		},
	}

	for _, c := range cases {
		actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)

		if actual != c.Expected {
			t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected)
		}
	}
}
|||
|
|||
// TestGetOptimalTiledCanvas verifies the canvas (a multiple of the tile size
// on each axis, at most MaxImageTiles tiles in total) selected for a range
// of image shapes, including extreme aspect ratios that must be clamped to
// the tile budget.
func TestGetOptimalTiledCanvas(t *testing.T) {
	type tiledCanvasSizeCase struct {
		ImageSize     image.Point
		MaxImageTiles int
		TileSize      int
		Expected      image.Point
	}

	cases := []tiledCanvasSizeCase{
		{
			ImageSize:     image.Point{1024, 768},
			MaxImageTiles: 4,
			TileSize:      1000,
			Expected:      image.Point{2000, 1000},
		},
		{
			ImageSize:     image.Point{1024, 768},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
		{
			ImageSize:     image.Point{800, 600},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
		{
			ImageSize:     image.Point{640, 480},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 560},
		},
		{
			// Smaller than one tile: a single tile suffices.
			ImageSize:     image.Point{320, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 560},
		},
		{
			ImageSize:     image.Point{1320, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1680, 560},
		},
		{
			ImageSize:     image.Point{2000, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{2240, 560},
		},
		{
			// Very wide: capped at MaxImageTiles tiles in a row.
			ImageSize:     image.Point{10000, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{2240, 560},
		},
		{
			ImageSize:     image.Point{480, 640},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 1120},
		},
		{
			ImageSize:     image.Point{200, 320},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 560},
		},
		{
			ImageSize:     image.Point{200, 1320},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 1680},
		},
		{
			ImageSize:     image.Point{200, 2000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 2240},
		},
		{
			// Very tall: capped at MaxImageTiles tiles in a column.
			ImageSize:     image.Point{200, 10000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 2240},
		},
		{
			// Huge square: 2x2 grid rather than 4x1 or 1x4.
			ImageSize:     image.Point{10000, 10000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
	}

	for _, c := range cases {
		actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)

		if actual != c.Expected {
			t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected)
		}
	}
}
|||
|
|||
// TestSplitToTiles checks that splitToTiles cuts an image into the requested
// grid of equally sized sub-images, comparing only tile count and bounds
// (not pixel contents).
func TestSplitToTiles(t *testing.T) {
	type splitCase struct {
		TestImage    image.Image
		NumTilesSize image.Point
		Expected     []image.Image
	}

	cases := []splitCase{
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			NumTilesSize: image.Point{1, 1},
			Expected:     []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 500)),
			NumTilesSize: image.Point{2, 1},
			Expected: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
			},
		},
		{
			// Row-major order: top row first.
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 1000)),
			NumTilesSize: image.Point{2, 2},
			Expected: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
				image.NewRGBA(image.Rect(0, 500, 500, 1000)),
				image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
			},
		},
	}

	for _, c := range cases {
		actual := splitToTiles(c.TestImage, c.NumTilesSize)

		if len(actual) != len(c.Expected) {
			t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
		}

		for i := range actual {
			if actual[i].Bounds() != c.Expected[i].Bounds() {
				t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds())
			}
		}
	}
}
|||
|
|||
// TestResize verifies resizeImage output dimensions and the reported tile
// aspect ratio for upscaling, downscaling, and pass-through cases. Every
// case uses the "png" format path, which composites before resizing.
func TestResize(t *testing.T) {
	type resizeCase struct {
		TestImage           image.Image
		OutputSize          image.Point
		MaxImageTiles       int
		ExpectedImage       image.Image
		ExpectedAspectRatio image.Point
	}

	cases := []resizeCase{
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 200, 200)),
			OutputSize:          image.Point{100, 100},
			MaxImageTiles:       1,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 200, 200)),
			OutputSize:          image.Point{100, 100},
			MaxImageTiles:       2,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			// Tiny input is upscaled to a full tile.
			TestImage:           image.NewRGBA(image.Rect(0, 0, 10, 10)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 560, 560)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 2560, 1920)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 1120, 840)),
			ExpectedAspectRatio: image.Point{2, 2},
		},
		{
			// Fits inside the 2x2 canvas already: size unchanged.
			TestImage:           image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			ExpectedAspectRatio: image.Point{2, 2},
		},
	}

	for _, c := range cases {
		actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)

		if actualImage.Bounds() != c.ExpectedImage.Bounds() {
			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
		}

		if actualAspectRatio != c.ExpectedAspectRatio {
			t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
		}
	}
}
|||
|
|||
// TestPad checks that padImage expands an image to a canvas of
// OutputSize * AspectRatio tiles, comparing bounds only.
func TestPad(t *testing.T) {
	type padCase struct {
		TestImage   image.Image
		OutputSize  image.Point
		AspectRatio image.Point
		Expected    image.Image
	}

	cases := []padCase{
		{
			TestImage:   image.NewRGBA(image.Rect(0, 0, 1000, 667)),
			OutputSize:  image.Point{560, 560},
			AspectRatio: image.Point{2, 2},
			Expected:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
		},
	}

	for _, c := range cases {
		actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio)

		if actual.Bounds() != c.Expected.Bounds() {
			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
		}
	}
}
|||
|
|||
// TestPackImages checks that packImages emits the expected number of float
// values: tilesX * tilesY * 3 channels * tile pixels.
func TestPackImages(t *testing.T) {
	type packCase struct {
		TestImage    image.Image
		AspectRatio  image.Point
		ExpectedVals int
	}

	cases := []packCase{
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
			AspectRatio:  image.Point{2, 2},
			ExpectedVals: 2 * 2 * 3 * 560 * 560,
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 560, 560)),
			AspectRatio:  image.Point{1, 1},
			ExpectedVals: 1 * 1 * 3 * 560 * 560,
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 560)),
			AspectRatio:  image.Point{1, 2},
			ExpectedVals: 1 * 2 * 3 * 560 * 560,
		},
	}

	for _, c := range cases {
		actualVals := packImages(c.TestImage, c.AspectRatio)
		if len(actualVals) != c.ExpectedVals {
			t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
		}
	}
}
|||
|
|||
// TestPreprocess round-trips synthetic images through PNG encoding and the
// full Preprocess pipeline, checking that pixel data is produced and the
// reported aspect ratio index matches.
func TestPreprocess(t *testing.T) {
	type preprocessCase struct {
		TestImage image.Image
		// NOTE(review): ExpectedVals is declared (always 0) but never
		// asserted against below — either assert len(imgData) or drop it.
		ExpectedVals          int
		ExpectedAspectRatioID int
	}

	cases := []preprocessCase{
		{
			TestImage:             image.NewRGBA(image.Rect(0, 0, 10, 10)),
			ExpectedVals:          0,
			ExpectedAspectRatioID: 1,
		},
		{
			TestImage:             image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			ExpectedVals:          0,
			ExpectedAspectRatioID: 6,
		},
	}

	for _, c := range cases {
		var buf bytes.Buffer
		err := png.Encode(&buf, c.TestImage)
		if err != nil {
			t.Fatal(err)
		}

		imgData, opts, err := Preprocess(&buf)
		if err != nil {
			t.Fatalf("error processing: %q", err)
		}

		if len(imgData) == 0 {
			t.Errorf("no image data returned")
		}

		ar, ok := opts["aspectRatioIndex"]
		if !ok {
			t.Fatalf("no aspect ratio found")
		}

		// Panics if the option is not an int, which would itself fail the test.
		aspectRatioID := ar.(int)

		if aspectRatioID != c.ExpectedAspectRatioID {
			t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
		}
	}
}
|||
@ -0,0 +1,387 @@ |
|||
package mllama |
|||
|
|||
import ( |
|||
"image" |
|||
"testing" |
|||
|
|||
"github.com/google/go-cmp/cmp" |
|||
) |
|||
|
|||
// TestSupportedAspectRatios checks the enumerated tile layouts (rank, width,
// height) produced by ImageProcessor.supportedAspectRatios for increasing
// tile budgets.
func TestSupportedAspectRatios(t *testing.T) {
	cases := []struct {
		p    ImageProcessor
		want []supportedAspectRatio
	}{
		{
			p: ImageProcessor{maxNumTiles: 1},
			want: []supportedAspectRatio{
				{1, 1, 1},
			},
		},
		{
			p: ImageProcessor{maxNumTiles: 2},
			want: []supportedAspectRatio{
				{1, 1, 1},
				{2, 1, 2},
				{3, 2, 1},
			},
		},
		{
			p: ImageProcessor{maxNumTiles: 3},
			want: []supportedAspectRatio{
				{1, 1, 1},
				{2, 1, 2},
				{3, 1, 3},
				{4, 2, 1},
				{5, 3, 1},
			},
		},
		{
			p: ImageProcessor{maxNumTiles: 4},
			want: []supportedAspectRatio{
				{1, 1, 1},
				{2, 1, 2},
				{3, 1, 3},
				{4, 1, 4},
				{5, 2, 1},
				{6, 2, 2},
				{7, 3, 1},
				{8, 4, 1},
			},
		},
	}

	for _, tt := range cases {
		actual := tt.p.supportedAspectRatios()
		// AllowUnexported lets cmp compare the unexported struct fields.
		if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}
|||
|
|||
// TestFitToCanvas verifies ImageProcessor.fitToCanvas scaling: images that
// already fit are unchanged, oversized images shrink to the canvas, and
// undersized images grow toward the configured image (tile) size.
func TestFitToCanvas(t *testing.T) {
	cases := []struct {
		p      ImageProcessor
		image  image.Point
		canvas image.Point
		expect image.Point
	}{
		{
			p:      ImageProcessor{imageSize: 200},
			image:  image.Point{400, 400},
			canvas: image.Point{640, 480},
			expect: image.Point{400, 400},
		},
		{
			p:      ImageProcessor{imageSize: 200},
			image:  image.Point{1024, 768},
			canvas: image.Point{640, 480},
			expect: image.Point{640, 480},
		},
		{
			p:      ImageProcessor{imageSize: 750},
			image:  image.Point{500, 500},
			canvas: image.Point{1000, 1000},
			expect: image.Point{750, 750},
		},
		{
			p:      ImageProcessor{imageSize: 2000},
			image:  image.Point{500, 1000},
			canvas: image.Point{2000, 2000},
			expect: image.Point{1000, 2000},
		},
		{
			// Mismatched aspect ratio: limited by the tighter axis.
			p:      ImageProcessor{imageSize: 1000},
			image:  image.Point{4000, 3000},
			canvas: image.Point{2000, 1000},
			expect: image.Point{1333, 1000},
		},
		{
			p:      ImageProcessor{imageSize: 560},
			image:  image.Point{667, 1000},
			canvas: image.Point{1000, 1000},
			expect: image.Point{667, 1000},
		},
	}

	for _, tt := range cases {
		actual := tt.p.fitToCanvas(tt.image, tt.canvas)
		if diff := cmp.Diff(actual, tt.expect); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}
|||
|
|||
// TestOptimalTiledCanvas verifies ImageProcessor.optimalTiledCanvas: the
// chosen canvas is a multiple of the tile size per axis, uses at most
// maxNumTiles tiles, and clamps extreme aspect ratios to the tile budget.
func TestOptimalTiledCanvas(t *testing.T) {
	cases := []struct {
		p      ImageProcessor
		image  image.Point
		expect image.Point
	}{
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 1000},
			image:  image.Point{1024, 768},
			expect: image.Point{2000, 1000},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{1024, 768},
			expect: image.Point{1120, 1120},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{800, 600},
			expect: image.Point{1120, 1120},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{640, 480},
			expect: image.Point{1120, 560},
		},
		{
			// Smaller than one tile: a single tile suffices.
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{320, 200},
			expect: image.Point{560, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{1320, 200},
			expect: image.Point{1680, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{2000, 200},
			expect: image.Point{2240, 560},
		},
		{
			// Very wide: capped at maxNumTiles tiles in a row.
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{10000, 200},
			expect: image.Point{2240, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{480, 640},
			expect: image.Point{560, 1120},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 320},
			expect: image.Point{560, 560},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 1320},
			expect: image.Point{560, 1680},
		},
		{
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 2000},
			expect: image.Point{560, 2240},
		},
		{
			// Very tall: capped at maxNumTiles tiles in a column.
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{200, 10000},
			expect: image.Point{560, 2240},
		},
		{
			// Huge square: 2x2 grid rather than 4x1 or 1x4.
			p:      ImageProcessor{maxNumTiles: 4, imageSize: 560},
			image:  image.Point{10000, 10000},
			expect: image.Point{1120, 1120},
		},
	}

	for _, tt := range cases {
		actual := tt.p.optimalTiledCanvas(tt.image)
		if diff := cmp.Diff(actual, tt.expect); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}
|||
|
|||
// TestSplitToTiles checks that ImageProcessor.splitToTiles cuts an image
// into the requested grid of equally sized sub-images in row-major order,
// comparing tile count and bounds only.
func TestSplitToTiles(t *testing.T) {
	cases := []struct {
		imageMax image.Point
		numTiles image.Point
		expect   []image.Image
	}{
		{
			imageMax: image.Point{1024, 768},
			numTiles: image.Point{1, 1},
			expect:   []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
		},
		{
			imageMax: image.Point{1000, 500},
			numTiles: image.Point{2, 1},
			expect: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
			},
		},
		{
			imageMax: image.Point{1000, 1000},
			numTiles: image.Point{2, 2},
			expect: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
				image.NewRGBA(image.Rect(0, 500, 500, 1000)),
				image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
			},
		},
	}

	// splitToTiles reads no processor state, so a zero value is sufficient.
	var p ImageProcessor

	for _, tt := range cases {
		actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles)

		if len(actual) != len(tt.expect) {
			t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect))
		}

		for i := range actual {
			if actual[i].Bounds() != tt.expect[i].Bounds() {
				t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds())
			}
		}
	}
}
|||
|
|||
// TestResize verifies ImageProcessor.resize output dimensions and the
// reported tile aspect ratio for upscaling, downscaling, and pass-through
// cases.
func TestResize(t *testing.T) {
	cases := []struct {
		p                 ImageProcessor
		imageMax          image.Point
		expectImage       image.Image
		expectAspectRatio image.Point
	}{
		{
			p:                 ImageProcessor{maxNumTiles: 1, imageSize: 100},
			imageMax:          image.Point{200, 200},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			expectAspectRatio: image.Point{1, 1},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 2, imageSize: 100},
			imageMax:          image.Point{200, 200},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			expectAspectRatio: image.Point{1, 1},
		},
		{
			// Tiny input is upscaled to a full tile.
			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:          image.Point{10, 10},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 560, 560)),
			expectAspectRatio: image.Point{1, 1},
		},
		{
			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:          image.Point{2560, 1920},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 1120, 840)),
			expectAspectRatio: image.Point{2, 2},
		},
		{
			// Fits inside the 2x2 canvas already: size unchanged.
			p:                 ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:          image.Point{1024, 768},
			expectImage:       image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			expectAspectRatio: image.Point{2, 2},
		},
	}

	for _, tt := range cases {
		actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax})

		if actualImage.Bounds() != tt.expectImage.Bounds() {
			t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds())
		}

		if actualAspectRatio != tt.expectAspectRatio {
			t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio)
		}
	}
}
|||
|
|||
// TestPad checks that ImageProcessor.pad expands an image to a canvas of
// imageSize * aspectRatio tiles, comparing bounds only.
func TestPad(t *testing.T) {
	cases := []struct {
		p           ImageProcessor
		imageMax    image.Point
		aspectRatio image.Point
		expect      image.Image
	}{
		{
			p:           ImageProcessor{maxNumTiles: 4, imageSize: 560},
			imageMax:    image.Point{1000, 667},
			aspectRatio: image.Point{2, 2},
			expect:      image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
		},
	}

	for _, tt := range cases {
		actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio)

		if actual.Bounds() != tt.expect.Bounds() {
			t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds())
		}
	}
}
|||
|
|||
// TestPackImages checks that ImageProcessor.pack emits the expected number
// of float values: tilesX * tilesY * 3 channels * tile pixels.
func TestPackImages(t *testing.T) {
	cases := []struct {
		imageMax    image.Point
		aspectRatio image.Point
		expectVals  int
	}{
		{
			imageMax:    image.Point{1120, 1120},
			aspectRatio: image.Point{2, 2},
			expectVals:  2 * 2 * 3 * 560 * 560,
		},
		{
			imageMax:    image.Point{560, 560},
			aspectRatio: image.Point{1, 1},
			expectVals:  1 * 1 * 3 * 560 * 560,
		},
		{
			imageMax:    image.Point{1120, 560},
			aspectRatio: image.Point{1, 2},
			expectVals:  1 * 2 * 3 * 560 * 560,
		},
	}

	for _, tt := range cases {
		// pack reads no processor state here, so a zero value is sufficient.
		var p ImageProcessor
		actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio)
		if len(actualVals) != tt.expectVals {
			t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals)
		}
	}
}
|||
|
|||
func TestPreprocess(t *testing.T) { |
|||
cases := []struct { |
|||
imageMax image.Point |
|||
expectAspectRatioID int |
|||
}{ |
|||
{ |
|||
imageMax: image.Point{10, 10}, |
|||
expectAspectRatioID: 1, |
|||
}, |
|||
{ |
|||
imageMax: image.Point{1024, 768}, |
|||
expectAspectRatioID: 6, |
|||
}, |
|||
} |
|||
|
|||
p := ImageProcessor{imageSize: 560, maxNumTiles: 4} |
|||
for _, tt := range cases { |
|||
img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax})) |
|||
if err != nil { |
|||
t.Fatalf("error processing: %q", err) |
|||
} |
|||
|
|||
if len(img) == 0 { |
|||
t.Errorf("no image data returned") |
|||
} |
|||
|
|||
if aspectRatio.rank != tt.expectAspectRatioID { |
|||
t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio, tt.expectAspectRatioID) |
|||
} |
|||
} |
|||
} |
|||
Loading…
Reference in new issue