mirror of https://gitee.com/namelin2022/ollama
committed by
GitHub
10 changed files with 820 additions and 117 deletions
@@ -0,0 +1,111 @@
|||
package imageproc |
|||
|
|||
import ( |
|||
"image" |
|||
"image/color" |
|||
|
|||
"golang.org/x/image/draw" |
|||
) |
|||
|
|||
// Per-channel mean and standard-deviation triples meant to be passed as the
// mean and std arguments of Normalize, which applies index 0 to the red
// channel, index 1 to green and index 2 to blue.
//
// NOTE(review): the names suggest these are the usual published ImageNet and
// CLIP preprocessing statistics; the values are taken as given here — confirm
// against the model configs before changing them.
var (
	ImageNetDefaultMean  = [3]float32{0.485, 0.456, 0.406}
	ImageNetDefaultSTD   = [3]float32{0.229, 0.224, 0.225}
	ImageNetStandardMean = [3]float32{0.5, 0.5, 0.5}
	ImageNetStandardSTD  = [3]float32{0.5, 0.5, 0.5}
	ClipDefaultMean      = [3]float32{0.48145466, 0.4578275, 0.40821073}
	ClipDefaultSTD       = [3]float32{0.26862954, 0.26130258, 0.27577711}
)
|||
|
|||
// Resize method selectors accepted as the method argument of Resize. Each
// one picks the golang.org/x/image/draw interpolator named in its comment.
const (
	ResizeBilinear        = iota // draw.BiLinear
	ResizeNearestNeighbor        // draw.NearestNeighbor
	ResizeApproxBilinear         // draw.ApproxBiLinear
	ResizeCatmullrom             // draw.CatmullRom
)
|||
|
|||
// Composite flattens img onto an opaque white canvas and returns the result
// as a new RGBA image with the same bounds as img. Transparent areas of img
// therefore come out white, and the input image is not modified.
func Composite(img image.Image) image.Image {
	bounds := img.Bounds()
	canvas := image.NewRGBA(bounds)

	// Fill the canvas with opaque white first (Src overwrites whatever is
	// there), then blend the source on top of it with the Over operator.
	background := image.NewUniform(color.RGBA{255, 255, 255, 255})
	draw.Draw(canvas, bounds, background, image.Point{}, draw.Src)
	draw.Draw(canvas, bounds, img, bounds.Min, draw.Over)

	return canvas
}
|||
|
|||
// Resize returns an image which has been scaled to a new size.
|
|||
func Resize(img image.Image, newSize image.Point, method int) image.Image { |
|||
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) |
|||
|
|||
kernels := map[int]draw.Interpolator{ |
|||
ResizeBilinear: draw.BiLinear, |
|||
ResizeNearestNeighbor: draw.NearestNeighbor, |
|||
ResizeApproxBilinear: draw.ApproxBiLinear, |
|||
ResizeCatmullrom: draw.CatmullRom, |
|||
} |
|||
|
|||
kernel, ok := kernels[method] |
|||
if !ok { |
|||
panic("no resizing method found") |
|||
} |
|||
|
|||
kernel.Scale(dst, dst.Rect, img, img.Bounds(), draw.Over, nil) |
|||
|
|||
return dst |
|||
} |
|||
|
|||
// Normalize converts img into a flat slice of float32 values, three per
// pixel (red, green, blue), each computed as (value - mean[c]) / std[c] for
// its channel c. Pixels are visited row by row, left to right.
//
// When rescale is true, each channel is first reduced to 8 bits (the color
// package reports 16-bit alpha-premultiplied components) and divided by 255,
// giving a value in [0, 1] before the mean/std adjustment.
//
// NOTE(review): when rescale is false the pixel data is not used at all —
// every channel starts from 0, so the output is (0 - mean[c]) / std[c]
// regardless of image content. The package tests pin this behaviour, so it
// is preserved here; confirm whether that is really what callers want.
//
// When channelFirst is true the result is laid out as all red values, then
// all green values, then all blue values. Otherwise the values are
// interleaved per pixel (r, g, b, r, g, b, ...).
//
// An image with empty bounds yields a nil slice.
func Normalize(img image.Image, mean, std [3]float32, rescale bool, channelFirst bool) []float32 {
	bounds := img.Bounds()
	if bounds.Empty() {
		return nil
	}

	// The output size is known up front, so allocate once rather than
	// growing the slice with append for every pixel.
	pixels := bounds.Dx() * bounds.Dy()
	out := make([]float32, 3*pixels)

	i := 0 // index of the current pixel in row-major order
	for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
		for x := bounds.Min.X; x < bounds.Max.X; x++ {
			r, g, b, _ := img.At(x, y).RGBA()

			var px [3]float32
			if rescale {
				px[0] = float32(r>>8) / 255.0
				px[1] = float32(g>>8) / 255.0
				px[2] = float32(b>>8) / 255.0
			}

			for ch := range px {
				px[ch] = (px[ch] - mean[ch]) / std[ch]
			}

			if channelFirst {
				// Planar layout: one contiguous plane per channel.
				out[i] = px[0]
				out[pixels+i] = px[1]
				out[2*pixels+i] = px[2]
			} else {
				// Interleaved layout: r, g, b for each pixel in turn.
				copy(out[3*i:], px[:])
			}
			i++
		}
	}

	return out
}
|||
@@ -0,0 +1,177 @@
|||
package imageproc |
|||
|
|||
import ( |
|||
"image" |
|||
"image/color" |
|||
"image/draw" |
|||
"reflect" |
|||
"testing" |
|||
) |
|||
|
|||
// createImage builds a width x height RGBA test image, anchored at (0, 0),
// in which every pixel is set to fillCol.
func createImage(width, height int, fillCol color.RGBA) image.Image {
	canvas := image.NewRGBA(image.Rect(0, 0, width, height))
	fill := image.NewUniform(fillCol)
	draw.Draw(canvas, canvas.Bounds(), fill, image.Point{}, draw.Src)
	return canvas
}
|||
|
|||
func TestComposite(t *testing.T) { |
|||
tests := []struct { |
|||
name string |
|||
img image.Image |
|||
expectedRGBA color.RGBA |
|||
}{ |
|||
{ |
|||
name: "Transparent image", |
|||
img: createImage(5, 5, color.RGBA{0, 0, 0, 0}), |
|||
expectedRGBA: color.RGBA{255, 255, 255, 255}, |
|||
}, |
|||
{ |
|||
name: "Solid red image", |
|||
img: createImage(5, 5, color.RGBA{255, 0, 0, 255}), |
|||
expectedRGBA: color.RGBA{255, 0, 0, 255}, |
|||
}, |
|||
} |
|||
|
|||
for _, tt := range tests { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
resultImg := Composite(tt.img) |
|||
|
|||
// Check the pixel values in the resulting image
|
|||
for x := range resultImg.Bounds().Dx() { |
|||
for y := range resultImg.Bounds().Dy() { |
|||
r, g, b, a := resultImg.At(x, y).RGBA() |
|||
expectedR, expectedG, expectedB, expectedA := tt.expectedRGBA.RGBA() |
|||
|
|||
if r != expectedR || g != expectedG || b != expectedB || a != expectedA { |
|||
t.Errorf("Pixel mismatch at (%d, %d): got (%d, %d, %d, %d), want (%d, %d, %d, %d)", |
|||
x, y, r, g, b, a, expectedR, expectedG, expectedB, expectedA) |
|||
} |
|||
} |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
func TestResize(t *testing.T) { |
|||
tests := []struct { |
|||
name string |
|||
img image.Image |
|||
newSize image.Point |
|||
method int |
|||
expected image.Point |
|||
}{ |
|||
{ |
|||
name: "Resize with bilinear interpolation", |
|||
img: createImage(5, 5, color.RGBA{255, 0, 0, 255}), |
|||
newSize: image.Point{10, 10}, |
|||
method: ResizeBilinear, |
|||
expected: image.Point{10, 10}, |
|||
}, |
|||
{ |
|||
name: "Resize with nearest neighbor", |
|||
img: createImage(10, 10, color.RGBA{0, 255, 0, 255}), |
|||
newSize: image.Point{5, 5}, |
|||
method: ResizeNearestNeighbor, |
|||
expected: image.Point{5, 5}, |
|||
}, |
|||
{ |
|||
name: "Resize with catmullrom", |
|||
img: createImage(1024, 1024, color.RGBA{0, 0, 255, 255}), |
|||
newSize: image.Point{10, 10}, |
|||
method: ResizeCatmullrom, |
|||
expected: image.Point{10, 10}, |
|||
}, |
|||
{ |
|||
name: "Resize with approx bilinear", |
|||
img: createImage(1024, 768, color.RGBA{100, 100, 100, 255}), |
|||
newSize: image.Point{4, 3}, |
|||
method: ResizeApproxBilinear, |
|||
expected: image.Point{4, 3}, |
|||
}, |
|||
} |
|||
for _, tt := range tests { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
resizedImg := Resize(tt.img, tt.newSize, tt.method) |
|||
|
|||
if resizedImg.Bounds().Dx() != tt.expected.X || resizedImg.Bounds().Dy() != tt.expected.Y { |
|||
t.Errorf("Unexpected size for resized image: got (%d, %d), want (%d, %d)", |
|||
resizedImg.Bounds().Dx(), resizedImg.Bounds().Dy(), tt.expected.X, tt.expected.Y) |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
|
|||
func TestResizeInvalidMethod(t *testing.T) { |
|||
defer func() { |
|||
if r := recover(); r == nil { |
|||
t.Errorf("Expected panic for invalid resizing method, but did not panic") |
|||
} |
|||
}() |
|||
|
|||
img := createImage(10, 10, color.RGBA{0, 0, 0, 255}) |
|||
Resize(img, image.Point{5, 5}, -1) |
|||
} |
|||
|
|||
func TestNormalize(t *testing.T) { |
|||
tests := []struct { |
|||
name string |
|||
img image.Image |
|||
mean [3]float32 |
|||
std [3]float32 |
|||
rescale bool |
|||
channelFirst bool |
|||
expected []float32 |
|||
}{ |
|||
{ |
|||
name: "Rescale with channel first", |
|||
img: createImage(2, 2, color.RGBA{128, 128, 128, 255}), |
|||
mean: ImageNetStandardMean, |
|||
std: ImageNetStandardSTD, |
|||
rescale: true, |
|||
channelFirst: true, |
|||
expected: []float32{ |
|||
0.003921628, 0.003921628, 0.003921628, 0.003921628, // R values
|
|||
0.003921628, 0.003921628, 0.003921628, 0.003921628, // G values
|
|||
0.003921628, 0.003921628, 0.003921628, 0.003921628, // B values
|
|||
}, |
|||
}, |
|||
{ |
|||
name: "Rescale without channel first", |
|||
img: createImage(2, 2, color.RGBA{255, 0, 0, 255}), |
|||
mean: [3]float32{0.0, 0.0, 0.0}, |
|||
std: [3]float32{1.0, 1.0, 1.0}, |
|||
rescale: true, |
|||
channelFirst: false, |
|||
expected: []float32{ |
|||
1.0, 0.0, 0.0, |
|||
1.0, 0.0, 0.0, |
|||
1.0, 0.0, 0.0, |
|||
1.0, 0.0, 0.0, |
|||
}, |
|||
}, |
|||
{ |
|||
name: "No rescale with mean/std adjustment", |
|||
img: createImage(2, 2, color.RGBA{100, 150, 200, 255}), |
|||
mean: ClipDefaultMean, |
|||
std: ClipDefaultSTD, |
|||
rescale: false, |
|||
channelFirst: false, |
|||
expected: []float32{ |
|||
-1.7922626, -1.7520971, -1.4802198, |
|||
-1.7922626, -1.7520971, -1.4802198, |
|||
-1.7922626, -1.7520971, -1.4802198, |
|||
-1.7922626, -1.7520971, -1.4802198, |
|||
}, |
|||
}, |
|||
} |
|||
|
|||
for _, tt := range tests { |
|||
t.Run(tt.name, func(t *testing.T) { |
|||
result := Normalize(tt.img, tt.mean, tt.std, tt.rescale, tt.channelFirst) |
|||
|
|||
if !reflect.DeepEqual(result, tt.expected) { |
|||
t.Errorf("Test %s failed: got %v, want %v", tt.name, result, tt.expected) |
|||
} |
|||
}) |
|||
} |
|||
} |
|||
@@ -0,0 +1,68 @@
|||
package pixtral |
|||
|
|||
import ( |
|||
"fmt" |
|||
"image" |
|||
_ "image/jpeg" |
|||
_ "image/png" |
|||
"io" |
|||
"math" |
|||
|
|||
"github.com/ollama/ollama/model/imageproc" |
|||
) |
|||
|
|||
// getNumImageTokens returns how many patches of patchSize are needed to
// cover imageSize along each axis, i.e. the per-axis ceiling of
// imageSize / patchSize for positive sizes.
//
// patchSize components must be non-zero; a zero component causes an integer
// divide-by-zero panic.
func getNumImageTokens(imageSize, patchSize image.Point) image.Point {
	return image.Point{
		X: (imageSize.X-1)/patchSize.X + 1,
		Y: (imageSize.Y-1)/patchSize.Y + 1,
	}
}

// getResizeOutputImageSize computes the size img should be resized to.
//
// If either side of img is longer than longestEdge, both sides are scaled
// down by the same ratio (rounding up) so that the longer side equals
// longestEdge. The result is then rounded up, per axis, to a whole multiple
// of patchSize.
//
// The image size is taken as the width and height of img.Bounds(), so images
// whose bounds do not start at (0, 0) — for example a SubImage — are measured
// correctly.
func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.Point) image.Point {
	size := img.Bounds().Size()

	le := float64(longestEdge)
	ratio := math.Max(float64(size.Y)/le, float64(size.X)/le)

	newSize := size
	if ratio > 1.0 {
		// Shrink only; images already within longestEdge keep their size.
		newSize = image.Point{
			X: int(math.Ceil(float64(size.X) / ratio)),
			Y: int(math.Ceil(float64(size.Y) / ratio)),
		}
	}

	tokens := getNumImageTokens(newSize, patchSize)
	return image.Point{
		X: tokens.X * patchSize.X,
		Y: tokens.Y * patchSize.Y,
	}
}
|||
|
|||
func resizeImage(img image.Image, format string, longestEdge int, patchSize image.Point) image.Image { |
|||
if format == "png" { |
|||
img = imageproc.Composite(img) |
|||
} |
|||
|
|||
newSize := getResizeOutputImageSize(img, longestEdge, patchSize) |
|||
|
|||
// todo should be ResizeBicubic, but it doesn't exist
|
|||
return imageproc.Resize(img, newSize, imageproc.ResizeBilinear) |
|||
} |
|||
|
|||
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { |
|||
img, format, err := image.Decode(imageData) |
|||
if err != nil { |
|||
return nil, nil, fmt.Errorf("failed to decode image: %w", err) |
|||
} |
|||
|
|||
longestEdge := 1024 |
|||
patchSize := image.Point{16, 16} |
|||
|
|||
img = resizeImage(img, format, longestEdge, patchSize) |
|||
|
|||
data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true) |
|||
|
|||
opts := map[string]any{} |
|||
return data, opts, nil |
|||
} |
|||
@@ -0,0 +1,219 @@
|||
package pixtral |
|||
|
|||
import ( |
|||
"bytes" |
|||
"encoding/binary" |
|||
"image" |
|||
"image/png" |
|||
"math" |
|||
"os" |
|||
"testing" |
|||
|
|||
"github.com/google/go-cmp/cmp" |
|||
) |
|||
|
|||
func TestGetNumImageTokens(t *testing.T) { |
|||
type numImageTokensCase struct { |
|||
ImageSize image.Point |
|||
PatchSize image.Point |
|||
Expected image.Point |
|||
} |
|||
|
|||
cases := []numImageTokensCase{ |
|||
{ |
|||
ImageSize: image.Point{1024, 764}, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{64, 48}, |
|||
}, |
|||
{ |
|||
ImageSize: image.Point{800, 600}, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{50, 38}, |
|||
}, |
|||
{ |
|||
ImageSize: image.Point{640, 480}, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{40, 30}, |
|||
}, |
|||
{ |
|||
ImageSize: image.Point{320, 200}, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{20, 13}, |
|||
}, |
|||
{ |
|||
ImageSize: image.Point{1320, 200}, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{83, 13}, |
|||
}, |
|||
{ |
|||
ImageSize: image.Point{2000, 200}, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{125, 13}, |
|||
}, |
|||
{ |
|||
ImageSize: image.Point{10000, 200}, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{625, 13}, |
|||
}, |
|||
{ |
|||
ImageSize: image.Point{1131, 577}, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{71, 37}, |
|||
}, |
|||
{ |
|||
ImageSize: image.Point{16, 16}, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{1, 1}, |
|||
}, |
|||
} |
|||
|
|||
for _, c := range cases { |
|||
actual := getNumImageTokens(c.ImageSize, c.PatchSize) |
|||
|
|||
if diff := cmp.Diff(actual, c.Expected); diff != "" { |
|||
t.Errorf("mismatch (-got +want):\n%s", diff) |
|||
} |
|||
} |
|||
} |
|||
|
|||
func TestGetResizeOutputImageSize(t *testing.T) { |
|||
type resizeCase struct { |
|||
Image image.Image |
|||
LongestEdge int |
|||
PatchSize image.Point |
|||
Expected image.Point |
|||
} |
|||
|
|||
cases := []resizeCase{ |
|||
{ |
|||
Image: image.NewRGBA(image.Rect(0, 0, 1024, 768)), |
|||
LongestEdge: 1024, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{1024, 768}, |
|||
}, |
|||
{ |
|||
Image: image.NewRGBA(image.Rect(0, 0, 1162, 690)), |
|||
LongestEdge: 1024, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{1024, 624}, |
|||
}, |
|||
{ |
|||
Image: image.NewRGBA(image.Rect(0, 0, 300, 200)), |
|||
LongestEdge: 1024, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{304, 208}, |
|||
}, |
|||
{ |
|||
Image: image.NewRGBA(image.Rect(0, 0, 1862, 522)), |
|||
LongestEdge: 1024, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.Point{1024, 288}, |
|||
}, |
|||
} |
|||
|
|||
for _, c := range cases { |
|||
actual := getResizeOutputImageSize(c.Image, c.LongestEdge, c.PatchSize) |
|||
|
|||
if diff := cmp.Diff(actual, c.Expected); diff != "" { |
|||
t.Errorf("mismatch (-got +want):\n%s", diff) |
|||
} |
|||
} |
|||
} |
|||
|
|||
func TestResize(t *testing.T) { |
|||
type resizeCase struct { |
|||
Image image.Image |
|||
LongestEdge int |
|||
PatchSize image.Point |
|||
Expected image.Image |
|||
} |
|||
|
|||
cases := []resizeCase{ |
|||
{ |
|||
Image: image.NewRGBA(image.Rect(0, 0, 1862, 522)), |
|||
LongestEdge: 1024, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.NewRGBA(image.Rect(0, 0, 1024, 288)), |
|||
}, |
|||
{ |
|||
Image: image.NewRGBA(image.Rect(0, 0, 10, 10)), |
|||
LongestEdge: 1024, |
|||
PatchSize: image.Point{16, 16}, |
|||
Expected: image.NewRGBA(image.Rect(0, 0, 16, 16)), |
|||
}, |
|||
} |
|||
|
|||
for _, c := range cases { |
|||
actual := resizeImage(c.Image, "png", c.LongestEdge, c.PatchSize) |
|||
|
|||
if actual.Bounds() != c.Expected.Bounds() { |
|||
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds()) |
|||
} |
|||
} |
|||
} |
|||
|
|||
func TestPreprocess(t *testing.T) { |
|||
type preprocessCase struct { |
|||
TestImage image.Image |
|||
ExpectedLen int |
|||
} |
|||
|
|||
cases := []preprocessCase{ |
|||
{ |
|||
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)), |
|||
ExpectedLen: 16 * 16 * 3 * 1, |
|||
}, |
|||
{ |
|||
TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)), |
|||
ExpectedLen: 1024 * 1024 * 3 * 1, |
|||
}, |
|||
} |
|||
|
|||
for _, c := range cases { |
|||
var buf bytes.Buffer |
|||
err := png.Encode(&buf, c.TestImage) |
|||
if err != nil { |
|||
t.Fatal(err) |
|||
} |
|||
|
|||
imgData, _, err := Preprocess(&buf) |
|||
if err != nil { |
|||
t.Fatalf("error processing: %q", err) |
|||
} |
|||
|
|||
switch len(imgData) { |
|||
case 0: |
|||
t.Errorf("no image data returned") |
|||
case c.ExpectedLen: |
|||
// ok
|
|||
default: |
|||
t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen) |
|||
} |
|||
} |
|||
} |
|||
|
|||
func TestPreprocessImages(t *testing.T) { |
|||
for _, testFile := range []string{"flight.png", "sportsball.png"} { |
|||
f, err := os.Open(testFile) |
|||
if err != nil { |
|||
t.Skipf("skipping test, no test image found at %s", testFile) |
|||
} |
|||
defer f.Close() |
|||
|
|||
imgData, _, err := Preprocess(f) |
|||
if err != nil { |
|||
t.Fatalf("error processing: %q", err) |
|||
} |
|||
|
|||
byteData := make([]byte, len(imgData)*4) // float32 is 4 bytes
|
|||
for i, f := range imgData { |
|||
binary.LittleEndian.PutUint32(byteData[i*4:], math.Float32bits(f)) |
|||
} |
|||
|
|||
outputPath := "processed_" + testFile + ".bin" |
|||
err = os.WriteFile(outputPath, byteData, 0o644) |
|||
if err != nil { |
|||
t.Fatalf("error writing processed image: %q", err) |
|||
} |
|||
} |
|||
} |
|||
@@ -0,0 +1,74 @@
|||
package qwen2vl |
|||
|
|||
import ( |
|||
"fmt" |
|||
"image" |
|||
_ "image/jpeg" |
|||
_ "image/png" |
|||
"io" |
|||
"math" |
|||
|
|||
"github.com/ollama/ollama/model/imageproc" |
|||
) |
|||
|
|||
// Default arguments that Preprocess passes to smartResize.
const (
	// DefaultFactor is the value both output dimensions are made a multiple of.
	DefaultFactor = 28
	// DefaultMinPixels is the lower bound on the output area (width * height).
	DefaultMinPixels = 56 * 56
	// DefaultMaxPixels is the upper bound on the output area (width * height).
	DefaultMaxPixels = 14 * 14 * 4 * 1280
)
|||
|
|||
// smartResize calculates the size an image of the given size should be
// resized to, such that:
//
//  1. both output dimensions are multiples of factor;
//  2. the output area is no larger than maxPixels and, where the input is
//     too small, is grown to at least minPixels;
//  3. the aspect ratio of the input is preserved as closely as the rounding
//     to multiples of factor allows (both sides are scaled by the same
//     amount).
//
// It panics if either side of size is smaller than factor, or if the longer
// side is more than 200 times the shorter side.
func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point {
	shorter, longer := min(size.X, size.Y), max(size.X, size.Y)
	switch {
	case shorter < factor:
		panic("image is too small to resize")
	case longer > 200*shorter:
		// Compared by multiplication rather than integer division, which
		// would truncate and let ratios such as 200.9:1 through.
		panic("aspect ratio must be less than 200:1")
	}

	f := float64(factor)
	width, height := float64(size.X), float64(size.Y)

	// Start from the nearest multiple of factor on each axis.
	xBar := math.Round(width/f) * f
	yBar := math.Round(height/f) * f

	switch area := xBar * yBar; {
	case area > float64(maxPixels):
		// Too large: scale both sides down by beta, rounding down so the
		// result stays within maxPixels.
		beta := math.Sqrt(height * width / float64(maxPixels))
		xBar = math.Floor(width/beta/f) * f
		yBar = math.Floor(height/beta/f) * f
	case area < float64(minPixels):
		// Too small: scale both sides up by beta, rounding up so the result
		// reaches minPixels.
		beta := math.Sqrt(float64(minPixels) / (height * width))
		xBar = math.Ceil(width*beta/f) * f
		yBar = math.Ceil(height*beta/f) * f
	}

	return image.Point{X: int(xBar), Y: int(yBar)}
}
|||
|
|||
func resizeImage(img image.Image, format string, size image.Point) image.Image { |
|||
if format == "png" { |
|||
img = imageproc.Composite(img) |
|||
} |
|||
|
|||
return imageproc.Resize(img, size, imageproc.ResizeBilinear) |
|||
} |
|||
|
|||
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) { |
|||
img, format, err := image.Decode(imageData) |
|||
if err != nil { |
|||
return nil, nil, fmt.Errorf("failed to decode image: %w", err) |
|||
} |
|||
|
|||
size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels) |
|||
img = resizeImage(img, format, size) |
|||
|
|||
data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true) |
|||
|
|||
opts := map[string]any{} |
|||
return data, opts, nil |
|||
} |
|||
@@ -0,0 +1,78 @@
|||
package qwen2vl |
|||
|
|||
import ( |
|||
"bytes" |
|||
"image" |
|||
"image/png" |
|||
"testing" |
|||
) |
|||
|
|||
func TestSmartResize(t *testing.T) { |
|||
type smartResizeCase struct { |
|||
TestImage image.Image |
|||
Expected image.Point |
|||
} |
|||
|
|||
cases := []smartResizeCase{ |
|||
{ |
|||
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)), |
|||
Expected: image.Point{980, 980}, |
|||
}, |
|||
{ |
|||
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)), |
|||
Expected: image.Point{1036, 756}, |
|||
}, |
|||
{ |
|||
TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)), |
|||
Expected: image.Point{980, 980}, |
|||
}, |
|||
} |
|||
|
|||
for _, c := range cases { |
|||
b := c.TestImage.Bounds().Max |
|||
actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels) |
|||
if actual != c.Expected { |
|||
t.Errorf("expected: %v, actual: %v", c.Expected, actual) |
|||
} |
|||
} |
|||
} |
|||
|
|||
func TestPreprocess(t *testing.T) { |
|||
type preprocessCase struct { |
|||
TestImage image.Image |
|||
ExpectedLen int |
|||
} |
|||
|
|||
cases := []preprocessCase{ |
|||
{ |
|||
TestImage: image.NewRGBA(image.Rect(0, 0, 256, 256)), |
|||
ExpectedLen: 252 * 252 * 3 * 1, |
|||
}, |
|||
{ |
|||
TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)), |
|||
ExpectedLen: 980 * 980 * 3 * 1, |
|||
}, |
|||
} |
|||
|
|||
for _, c := range cases { |
|||
var buf bytes.Buffer |
|||
err := png.Encode(&buf, c.TestImage) |
|||
if err != nil { |
|||
t.Fatal(err) |
|||
} |
|||
|
|||
imgData, _, err := Preprocess(&buf) |
|||
if err != nil { |
|||
t.Fatalf("error processing: %q", err) |
|||
} |
|||
|
|||
switch len(imgData) { |
|||
case 0: |
|||
t.Errorf("no image data returned") |
|||
case c.ExpectedLen: |
|||
// ok
|
|||
default: |
|||
t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen) |
|||
} |
|||
} |
|||
} |
|||
Loading…
Reference in new issue