mirror of https://gitee.com/namelin2022/ollama
Browse Source
* llama: wire up builtin runner

  This adds a new entrypoint into the ollama CLI to run the cgo built runner. On Mac arm64, this will have GPU support, but on all other platforms it will be the lowest common denominator CPU build. After we fully transition to the new Go runners, more tech-debt can be removed and we can stop building the "default" runner via make and rely on the builtin always.

* build: Make target improvements

  Add a few new targets and help for building locally. This also adjusts the runner lookup to favor local builds, then runners relative to the executable, and finally payloads.

* Support customized CPU flags for runners

  This implements a simplified custom CPU flags pattern for the runners. When built without overrides, the runner name contains the vector flag we check for (AVX) to ensure we don't try to run on unsupported systems and crash. If the user builds a customized set, we omit the naming scheme and don't check for compatibility. This avoids checking requirements at runtime, so that logic has been removed as well. This can be used to build GPU runners with no vector flags, or CPU/GPU runners with additional flags (e.g. AVX512) enabled.

* Use relative paths

  If the user checks out the repo in a path that contains spaces, make gets really confused, so use relative paths for everything in-repo to avoid breakage.

* Remove payloads from main binary

* install: clean up prior libraries

  This removes support for v0.3.6 and older versions (before the tar bundle) and ensures we clean up prior libraries before extracting the bundle(s). Without this change, runners and dependent libraries could leak when we update and lead to subtle runtime errors.

parth/cmd-cleanup-SO
committed by
GitHub
60 changed files with 891 additions and 1182 deletions
@ -1,4 +1,105 @@ |
|||
GOALS := $(or $(MAKECMDGOALS),all) |
|||
.PHONY: $(GOALS) |
|||
$(GOALS): |
|||
$(MAKE) -C llama $@ |
|||
# top level makefile for Ollama
|
|||
include make/common-defs.make |
|||
|
|||
|
|||
# Determine which if any GPU runners we should build
|
|||
include make/cuda-v11-defs.make |
|||
include make/cuda-v12-defs.make |
|||
include make/rocm-defs.make |
|||
|
|||
ifeq ($(CUSTOM_CPU_FLAGS),) |
|||
ifneq ($(OS),darwin) |
|||
ifeq ($(ARCH),amd64) |
|||
RUNNER_TARGETS=cpu |
|||
endif |
|||
endif |
|||
# Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present
|
|||
ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),) |
|||
ifneq ($(CUDA_11_COMPILER),) |
|||
RUNNER_TARGETS += cuda_v11 |
|||
endif |
|||
ifneq ($(CUDA_12_COMPILER),) |
|||
RUNNER_TARGETS += cuda_v12 |
|||
endif |
|||
endif |
|||
else # CUSTOM_CPU_FLAGS is set, we'll build only the latest cuda version detected
|
|||
ifneq ($(CUDA_12_COMPILER),) |
|||
RUNNER_TARGETS += cuda_v12 |
|||
else ifneq ($(CUDA_11_COMPILER),) |
|||
RUNNER_TARGETS += cuda_v11 |
|||
endif |
|||
endif |
|||
|
|||
ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),) |
|||
ifneq ($(HIP_COMPILER),) |
|||
RUNNER_TARGETS += rocm |
|||
endif |
|||
endif |
|||
|
|||
|
|||
all: runners exe |
|||
|
|||
dist: $(addprefix dist_, $(RUNNER_TARGETS)) dist_exe |
|||
|
|||
dist_%: |
|||
@$(MAKE) --no-print-directory -f make/Makefile.$* dist |
|||
|
|||
runners: $(RUNNER_TARGETS) |
|||
|
|||
$(RUNNER_TARGETS): |
|||
@$(MAKE) --no-print-directory -f make/Makefile.$@ |
|||
|
|||
exe dist_exe: |
|||
@$(MAKE) --no-print-directory -f make/Makefile.ollama $@ |
|||
|
|||
help-sync apply-patches create-patches sync sync-clean: |
|||
@$(MAKE) --no-print-directory -f make/Makefile.sync $@ |
|||
|
|||
test integration lint: |
|||
@$(MAKE) --no-print-directory -f make/Makefile.test $@ |
|||
|
|||
clean: |
|||
rm -rf $(BUILD_DIR) $(DIST_LIB_DIR) $(OLLAMA_EXE) $(DIST_OLLAMA_EXE) |
|||
go clean -cache |
|||
|
|||
# help: print a summary of the build and test targets offered by this makefile.
# The list of enabled runners is taken from $(RUNNER_TARGETS) as computed above.
help:
	@echo "The following make targets will help you build Ollama"
	@echo ""
	@echo " make all # (default target) Build Ollama llm subprocess runners, and the primary ollama executable"
	@echo " make runners # Build Ollama llm subprocess runners; after you may use 'go build .' to build the primary ollama executable"
	@echo " make <runner> # Build specific runners. Enabled: '$(RUNNER_TARGETS)'"
	@echo " make dist # Build the runners and primary ollama executable for distribution"
	@echo " make help-sync # Help information on vendor update targets"
	@echo " make help-runners # Help information on runner targets"
	@echo ""
	@echo "The following make targets will help you test Ollama"
	@echo ""
	@echo " make test # Run unit tests"
	@echo " make integration # Run integration tests. You must 'make all' first"
	@echo " make lint # Run lint and style tests"
	@echo ""
	@echo "For more information see 'docs/development.md'"
	@echo ""
|||
|
|||
|
|||
help-runners: |
|||
@echo "The following runners will be built based on discovered GPU libraries: '$(RUNNER_TARGETS)'" |
|||
@echo "" |
|||
@echo "GPU Runner CPU Flags: '$(GPU_RUNNER_CPU_FLAGS)' (Override with CUSTOM_CPU_FLAGS)" |
|||
@echo "" |
|||
@echo "# CUDA_PATH sets the location where CUDA toolkits are present" |
|||
@echo "CUDA_PATH=$(CUDA_PATH)" |
|||
@echo " CUDA_11_PATH=$(CUDA_11_PATH)" |
|||
@echo " CUDA_11_COMPILER=$(CUDA_11_COMPILER)" |
|||
@echo " CUDA_12_PATH=$(CUDA_12_PATH)" |
|||
@echo " CUDA_12_COMPILER=$(CUDA_12_COMPILER)" |
|||
@echo "" |
|||
@echo "# HIP_PATH sets the location where the ROCm toolkit is present" |
|||
@echo "HIP_PATH=$(HIP_PATH)" |
|||
@echo " HIP_COMPILER=$(HIP_COMPILER)" |
|||
|
|||
# Every target listed here names a command, not a file, so it must run even if
# a file or directory with the same name exists in the working directory.
# The dist_<runner> targets are deliberately not listed: they are produced by
# the dist_% pattern rule, and make skips implicit rule search for phony targets.
.PHONY: all exe dist dist_exe help help-sync help-runners apply-patches create-patches sync sync-clean test integration lint runners clean $(RUNNER_TARGETS)
|||
|
|||
# Handy debugging for make variables
|
|||
print-%: |
|||
@echo '$*=$($*)' |
|||
|
|||
@ -1 +0,0 @@ |
|||
This is here to make sure the build/ directory exists for the go:embed command |
|||
@ -1 +0,0 @@ |
|||
This is here to make sure the build/ directory exists for the go:embed command |
|||
@ -1,8 +0,0 @@ |
|||
package build |
|||
|
|||
import "embed" |
|||
|
|||
// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
|
|||
|
|||
//go:embed darwin/amd64/*
|
|||
var EmbedFS embed.FS |
|||
@ -1,8 +0,0 @@ |
|||
package build |
|||
|
|||
import "embed" |
|||
|
|||
// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
|
|||
|
|||
//go:embed darwin/arm64/*
|
|||
var EmbedFS embed.FS |
|||
@ -1,6 +0,0 @@ |
|||
package build |
|||
|
|||
import "embed" |
|||
|
|||
//go:embed linux/*
|
|||
var EmbedFS embed.FS |
|||
@ -1,8 +0,0 @@ |
|||
//go:build !linux && !darwin
|
|||
|
|||
package build |
|||
|
|||
import "embed" |
|||
|
|||
// unused on windows
|
|||
var EmbedFS embed.FS |
|||
@ -1 +0,0 @@ |
|||
This is here to make sure the build/ directory exists for the go:embed command |
|||
@ -1 +0,0 @@ |
|||
This is here to make sure the build/ directory exists for the go:embed command |
|||
@ -0,0 +1,15 @@ |
|||
package main |
|||
|
|||
import ( |
|||
"fmt" |
|||
"os" |
|||
|
|||
"github.com/ollama/ollama/llama/runner" |
|||
) |
|||
|
|||
func main() { |
|||
if err := runner.Execute(os.Args[1:]); err != nil { |
|||
fmt.Fprintf(os.Stderr, "error: %s\n", err) |
|||
os.Exit(1) |
|||
} |
|||
} |
|||
@ -1,57 +0,0 @@ |
|||
# top level makefile for Go server
|
|||
include make/common-defs.make |
|||
|
|||
RUNNER_TARGETS := default |
|||
|
|||
# Determine which if any GPU runners we should build
|
|||
ifeq ($(OS),windows) |
|||
CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown |
|||
CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null)) |
|||
CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null) |
|||
CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null) |
|||
HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null) |
|||
else ifeq ($(OS),linux) |
|||
HIP_PATH?=/opt/rocm |
|||
HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null) |
|||
CUDA_PATH?=/usr/local/cuda |
|||
CUDA_11:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null) |
|||
CUDA_12:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null) |
|||
endif |
|||
|
|||
ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),) |
|||
ifneq ($(CUDA_11),) |
|||
RUNNER_TARGETS += cuda_v11 |
|||
endif |
|||
ifneq ($(CUDA_12),) |
|||
RUNNER_TARGETS += cuda_v12 |
|||
endif |
|||
endif |
|||
ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),) |
|||
ifneq ($(HIP_LIB_DIR),) |
|||
RUNNER_TARGETS += rocm |
|||
endif |
|||
endif |
|||
|
|||
|
|||
all: clean-payload .WAIT runners |
|||
|
|||
runners: $(RUNNER_TARGETS) |
|||
|
|||
$(RUNNER_TARGETS): |
|||
$(MAKE) -f make/Makefile.$@ |
|||
|
|||
help-sync apply-patches create-patches sync: |
|||
$(MAKE) -f make/Makefile.sync $@ |
|||
|
|||
clean: |
|||
rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) |
|||
go clean -cache |
|||
|
|||
clean-payload: |
|||
rm -rf $(addprefix $(RUNNERS_PAYLOAD_DIR)/, $(RUNNER_TARGETS) metal cpu cpu_avx cpu_avx2) |
|||
|
|||
.PHONY: all runners clean clean-payload $(RUNNER_TARGETS) .WAIT |
|||
|
|||
# Handy debugging for make variables
|
|||
print-%: |
|||
@echo '$*=$($*)' |
|||
@ -1,12 +0,0 @@ |
|||
# Build rules for CUDA v11 runner
|
|||
|
|||
include make/common-defs.make |
|||
|
|||
|
|||
GPU_RUNNER_VARIANT := _v11 |
|||
GPU_PATH_ROOT_WIN=$(shell ls -d $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))/v11.? 2>/dev/null) |
|||
GPU_PATH_ROOT_LINUX=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null) |
|||
CUDA_ARCHITECTURES?=50;52;53;60;61;62;70;72;75;80;86 |
|||
|
|||
include make/cuda.make |
|||
include make/gpu.make |
|||
@ -1,12 +0,0 @@ |
|||
# Build rules for CUDA v12 runner
|
|||
|
|||
include make/common-defs.make |
|||
|
|||
|
|||
GPU_RUNNER_VARIANT := _v12 |
|||
GPU_PATH_ROOT_WIN=$(shell ls -d $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))/v12.? 2>/dev/null) |
|||
GPU_PATH_ROOT_LINUX=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null) |
|||
CUDA_ARCHITECTURES?=60;61;62;70;72;75;80;86;87;89;90;90a |
|||
|
|||
include make/cuda.make |
|||
include make/gpu.make |
|||
@ -1,54 +0,0 @@ |
|||
# Build the default runner(s) for the platform which do not rely on 3rd party GPU libraries
|
|||
# On Mac arm64, this builds the metal runner
|
|||
# On other platforms this builds the CPU runner(s)
|
|||
|
|||
include make/common-defs.make |
|||
|
|||
CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(TARGET_CPU_FLAGS))\" $(TARGET_LDFLAGS)" |
|||
DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu) |
|||
RUNNERS := $(DEFAULT_RUNNER) |
|||
ifeq ($(ARCH),amd64) |
|||
ifeq ($(CUSTOM_CPU_FLAGS),) |
|||
RUNNERS += cpu_avx cpu_avx2 |
|||
endif |
|||
endif |
|||
|
|||
DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS))) |
|||
ifneq ($(OS),windows) |
|||
PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(RUNNERS))) |
|||
endif |
|||
BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS))) |
|||
|
|||
all: $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) |
|||
|
|||
$(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS) |
|||
$(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) |
|||
@-mkdir -p $(dir $@) |
|||
GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS))) -o $@ ./runner |
|||
|
|||
$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx" |
|||
$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) |
|||
@-mkdir -p $(dir $@) |
|||
GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner |
|||
|
|||
$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx avx2" |
|||
$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) |
|||
@-mkdir -p $(dir $@) |
|||
GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner |
|||
|
|||
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/% |
|||
@-mkdir -p $(dir $@) |
|||
cp $< $@ |
|||
|
|||
$(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server$(EXE_EXT).gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server$(EXE_EXT) |
|||
@-mkdir -p $(dir $@) |
|||
${GZIP} --best -c $< > $@ |
|||
|
|||
clean: |
|||
rm -f $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) |
|||
|
|||
.PHONY: clean all |
|||
|
|||
# Handy debugging for make variables
|
|||
print-%: |
|||
@echo '$*=$($*)' |
|||
@ -1,50 +0,0 @@ |
|||
# Common definitions for all cuda versions |
|||
|
|||
ifndef GPU_RUNNER_VARIANT |
|||
dummy: |
|||
$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables) |
|||
endif |
|||
|
|||
|
|||
GPU_RUNNER_NAME := cuda$(GPU_RUNNER_VARIANT) |
|||
GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT) |
|||
GPU_RUNNER_DRIVER_LIB_LINK := -lcuda |
|||
GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt |
|||
GPU_LIB_DIR_WIN = $(GPU_PATH_ROOT_WIN)/bin |
|||
GPU_LIB_DIR_LINUX = $(GPU_PATH_ROOT_LINUX)/lib64 |
|||
CGO_EXTRA_LDFLAGS_WIN = -L"$(GPU_PATH_ROOT_WIN)/lib/x64" |
|||
GPU_COMPILER_WIN = $(GPU_PATH_ROOT_WIN)/bin/nvcc |
|||
GPU_COMPILER_LINUX = $(GPU_PATH_ROOT_LINUX)/bin/nvcc |
|||
GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602 |
|||
GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE |
|||
GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602 |
|||
GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE |
|||
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))) |
|||
GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS)))) |
|||
|
|||
ifeq ($(OS),linux) |
|||
CUDA_PATH?=/usr/local/cuda |
|||
GPU_COMPILER_FPIC = -fPIC -Wno-unused-function -std=c++11 |
|||
endif |
|||
GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \ |
|||
-DGGML_CUDA_USE_GRAPHS=1 |
|||
GPU_COMPILER_CUFLAGS = \ |
|||
$(GPU_COMPILER_FPIC) \ |
|||
-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(_OS_GPU_RUNNER_CPU_FLAGS))" \ |
|||
-t2 \ |
|||
-DGGML_CUDA_DMMV_X=32 \ |
|||
-DGGML_CUDA_MMV_Y=1 \ |
|||
-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \ |
|||
-DGGML_USE_CUDA=1 \ |
|||
-DGGML_SHARED=1 \ |
|||
-DGGML_BUILD=1 \ |
|||
-DGGML_USE_LLAMAFILE \ |
|||
-DK_QUANTS_PER_ITERATION=2 \ |
|||
-DNDEBUG \ |
|||
-D_GNU_SOURCE \ |
|||
-D_XOPEN_SOURCE=600 \ |
|||
-Wno-deprecated-gpu-targets \ |
|||
--forward-unknown-to-host-compiler \ |
|||
-use_fast_math \ |
|||
-I. \ |
|||
-O3 |
|||
@ -1,122 +0,0 @@ |
|||
# Generalized GPU runner build |
|||
|
|||
ifndef GPU_RUNNER_NAME |
|||
dummy: |
|||
$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables) |
|||
endif |
|||
|
|||
ifeq ($(OS),windows) |
|||
GPU_COMPILER:=$(GPU_COMPILER_WIN) |
|||
GPU_LIB_DIR:=$(GPU_LIB_DIR_WIN) |
|||
CGO_EXTRA_LDFLAGS:=$(CGO_EXTRA_LDFLAGS_WIN) |
|||
GPU_COMPILER_CFLAGS = $(GPU_COMPILER_CFLAGS_WIN) |
|||
GPU_COMPILER_CXXFLAGS = $(GPU_COMPILER_CXXFLAGS_WIN) |
|||
else ifeq ($(OS),linux) |
|||
GPU_COMPILER:=$(GPU_COMPILER_LINUX) |
|||
GPU_LIB_DIR:=$(GPU_LIB_DIR_LINUX) |
|||
CGO_EXTRA_LDFLAGS:=$(CGO_EXTRA_LDFLAGS_LINUX) |
|||
GPU_COMPILER_CFLAGS = $(GPU_COMPILER_CFLAGS_LINUX) |
|||
GPU_COMPILER_CXXFLAGS = $(GPU_COMPILER_CXXFLAGS_LINUX) |
|||
endif |
|||
|
|||
GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS))\" $(TARGET_LDFLAGS)" |
|||
|
|||
# TODO Unify how we handle dependencies in the dist/packaging and install flow |
|||
# today, cuda is bundled, but rocm is split out. Should split them each out by runner |
|||
DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR) |
|||
|
|||
ifeq ($(OS),windows) |
|||
_OS_GPU_RUNNER_CPU_FLAGS=$(call uc,$(GPU_RUNNER_CPU_FLAGS)) |
|||
else ifeq ($(OS),linux) |
|||
_OS_GPU_RUNNER_CPU_FLAGS=$(GPU_RUNNER_CPU_FLAGS) |
|||
endif |
|||
|
|||
GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))) |
|||
DIST_GPU_RUNNER_LIB_DEPS = $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_RUNNER_LIBS))) |
|||
|
|||
GPU_RUNNER_SRCS := \ |
|||
ggml-cuda.cu \ |
|||
$(filter-out $(wildcard ggml-cuda/fattn*.cu),$(wildcard ggml-cuda/*.cu)) \ |
|||
$(wildcard ggml-cuda/template-instances/mmq*.cu) \ |
|||
ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp ggml-aarch64.c |
|||
GPU_RUNNER_HDRS := \ |
|||
$(wildcard ggml-cuda/*.cuh) |
|||
|
|||
|
|||
# Conditional flags and components to speed up developer builds |
|||
ifneq ($(OLLAMA_FAST_BUILD),) |
|||
GPU_COMPILER_CUFLAGS += \ |
|||
-DGGML_DISABLE_FLASH_ATTN |
|||
else |
|||
GPU_RUNNER_SRCS += \ |
|||
$(wildcard ggml-cuda/fattn*.cu) \ |
|||
$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu) \ |
|||
$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \ |
|||
$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \ |
|||
$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu) |
|||
endif |
|||
|
|||
GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT)) |
|||
GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT)) |
|||
GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT))) |
|||
|
|||
DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME))) |
|||
ifneq ($(OS),windows) |
|||
PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(GPU_RUNNER_NAME))) |
|||
endif |
|||
BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME))) |
|||
|
|||
|
|||
$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) |
|||
|
|||
# Build targets |
|||
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu |
|||
@-mkdir -p $(dir $@) |
|||
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $< |
|||
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c |
|||
@-mkdir -p $(dir $@) |
|||
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $< |
|||
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp |
|||
@-mkdir -p $(dir $@) |
|||
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $< |
|||
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/" $(CGO_EXTRA_LDFLAGS) |
|||
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) |
|||
@-mkdir -p $(dir $@) |
|||
GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner |
|||
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS) |
|||
@-mkdir -p $(dir $@) |
|||
$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@ |
|||
|
|||
# Distribution targets |
|||
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/% |
|||
@-mkdir -p $(dir $@) |
|||
$(CP) $< $@ |
|||
$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_DEPS_LIBS) |
|||
$(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) |
|||
@-mkdir -p $(dir $@) |
|||
$(CP) $< $@ |
|||
$(DIST_GPU_RUNNER_LIB_DEPS): |
|||
@-mkdir -p $(dir $@) |
|||
$(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@) |
|||
$(GPU_DIST_DEPS_LIBS): |
|||
@-mkdir -p $(dir $@) |
|||
$(CP) $(dir $(filter %$(notdir $@),$(GPU_LIBS) $(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@) |
|||
|
|||
# Payload targets |
|||
$(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server.gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server |
|||
@-mkdir -p $(dir $@) |
|||
${GZIP} --best -c $< > $@ |
|||
$(RUNNERS_PAYLOAD_DIR)/$(GPU_RUNNER_NAME)/%.gz: $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/% |
|||
@-mkdir -p $(dir $@) |
|||
${GZIP} --best -c $< > $@ |
|||
|
|||
clean: |
|||
rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) |
|||
|
|||
.PHONY: clean $(GPU_RUNNER_NAME) |
|||
|
|||
|
|||
# Handy debugging for make variables |
|||
print-%: |
|||
@echo '$*=$($*)' |
|||
|
|||
@ -1,19 +0,0 @@ |
|||
package main |
|||
|
|||
import ( |
|||
"encoding/json" |
|||
"os" |
|||
|
|||
"github.com/ollama/ollama/llama" |
|||
"github.com/ollama/ollama/version" |
|||
) |
|||
|
|||
func printRequirements(fp *os.File) { |
|||
attrs := map[string]string{ |
|||
"system_info": llama.PrintSystemInfo(), |
|||
"version": version.Version, |
|||
"cpu_features": llama.CpuFeatures, |
|||
} |
|||
enc := json.NewEncoder(fp) |
|||
_ = enc.Encode(attrs) |
|||
} |
|||
@ -0,0 +1,40 @@ |
|||
# Build the discrete cpu runner(s) for the platform which do not rely on 3rd party GPU libraries
|
|||
|
|||
include make/common-defs.make |
|||
|
|||
CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(TARGET_CPU_FLAGS))\" $(TARGET_LDFLAGS)" |
|||
ifeq ($(ARCH),amd64) |
|||
ifeq ($(origin CUSTOM_CPU_FLAGS),undefined) |
|||
RUNNERS = cpu_avx cpu_avx2 |
|||
endif |
|||
endif |
|||
|
|||
DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS))) |
|||
BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS))) |
|||
|
|||
cpu: $(BUILD_RUNNERS) |
|||
|
|||
dist: $(DIST_RUNNERS) |
|||
|
|||
$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx" |
|||
$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) |
|||
@-mkdir -p $(dir $@) |
|||
GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./cmd/runner |
|||
|
|||
$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx avx2" |
|||
$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) |
|||
@-mkdir -p $(dir $@) |
|||
GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./cmd/runner |
|||
|
|||
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/% |
|||
@-mkdir -p $(dir $@) |
|||
cp $< $@ |
|||
|
|||
clean: |
|||
rm -f $(BUILD_RUNNERS) $(DIST_RUNNERS) |
|||
|
|||
.PHONY: clean cpu dist |
|||
|
|||
# Handy debugging for make variables
|
|||
print-%: |
|||
@echo '$*=$($*)' |
|||
@ -0,0 +1,13 @@ |
|||
# Build rules for CUDA v11 runner
|
|||
|
|||
include make/common-defs.make |
|||
include make/cuda-v11-defs.make |
|||
|
|||
GPU_RUNNER_VARIANT := _v11 |
|||
GPU_COMPILER=$(CUDA_11_COMPILER) |
|||
CUDA_ARCHITECTURES?=50;52;53;60;61;62;70;72;75;80;86 |
|||
GPU_LIB_DIR = $(CUDA_11_LIB_DIR) |
|||
CGO_EXTRA_LDFLAGS = $(CUDA_11_CGO_EXTRA_LDFLAGS) |
|||
|
|||
include make/cuda.make |
|||
include make/gpu.make |
|||
@ -0,0 +1,13 @@ |
|||
# Build rules for CUDA v12 runner
|
|||
|
|||
include make/common-defs.make |
|||
include make/cuda-v12-defs.make |
|||
|
|||
GPU_RUNNER_VARIANT := _v12 |
|||
GPU_COMPILER=$(CUDA_12_COMPILER) |
|||
CUDA_ARCHITECTURES?=60;61;62;70;72;75;80;86;87;89;90;90a |
|||
GPU_LIB_DIR = $(CUDA_12_LIB_DIR) |
|||
CGO_EXTRA_LDFLAGS = $(CUDA_12_CGO_EXTRA_LDFLAGS) |
|||
|
|||
include make/cuda.make |
|||
include make/gpu.make |
|||
@ -0,0 +1,19 @@ |
|||
# Makefile for building top-level ollama binary
|
|||
|
|||
include make/common-defs.make |
|||
|
|||
exe: $(OLLAMA_EXE) |
|||
dist_exe dist_ollama: $(DIST_OLLAMA_EXE) |
|||
|
|||
GO_DEPS=$(foreach dir,$(shell go list -deps -f '{{.Dir}}' . ),$(wildcard $(dir)/*.go)) |
|||
CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(TARGET_CPU_FLAGS))\" $(EXTRA_GOLDLAGS) $(TARGET_LDFLAGS)" |
|||
|
|||
$(OLLAMA_EXE) $(DIST_OLLAMA_EXE): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS) |
|||
$(OLLAMA_EXE) $(DIST_OLLAMA_EXE): $(COMMON_SRCS) $(COMMON_HDRS) $(GO_DEPS) |
|||
GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS))) -o $@ . |
|||
|
|||
.PHONY: ollama dist_ollama exe dist_exe |
|||
|
|||
# Handy debugging for make variables
|
|||
print-%: |
|||
@echo '$*=$($*)' |
|||
@ -0,0 +1,19 @@ |
|||
# Targets to assist in running tests
|
|||
|
|||
include make/common-defs.make |
|||
|
|||
# These targets run commands and never produce a file of the same name.
# Declaring them phony matters in particular for `integration`, which is also
# the name of the Go package directory passed to `go test` below.
.PHONY: test integration lint

# This makefile includes make/common-defs.make by a path relative to the
# working directory, so it is expected to be run from the repository root
# (e.g. `make -f make/Makefile.test test`). The Go tools are therefore invoked
# in place rather than after a `cd ..`, which would leave the repository.

# Run the unit tests for every package.
test:
	go test ./...

# Run the integration tests; requires the ollama executable to exist already.
integration: $(OLLAMA_EXE)
	go test --tags=integration ./integration -v

# Run lint and style checks.
lint:
	golangci-lint run -v
|||
|
|||
# Note: in this makefile we error instead of building to allow more fine-grain control of testing flows
|
|||
$(OLLAMA_EXE): |
|||
@echo "" |
|||
@echo "ERROR: You must build ollama first - use 'make all' to build the ollama binaries" |
|||
@echo "" |
|||
@exit 1 |
|||
@ -0,0 +1,17 @@ |
|||
# Common definitions for the various Makefiles which set cuda settings |
|||
# No rules are defined here so this is safe to include at the beginning of other makefiles |
|||
|
|||
ifeq ($(OS),windows) |
|||
CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown |
|||
CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null)) |
|||
CUDA_11_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null) |
|||
CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc.exe) |
|||
CUDA_11_LIB_DIR = $(strip $(shell ls -d $(CUDA_11_PATH)/bin 2>/dev/null)) |
|||
CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_PATH)/lib/x64" |
|||
else ifeq ($(OS),linux) |
|||
CUDA_PATH?=/usr/local/cuda |
|||
CUDA_11_PATH:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null) |
|||
CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc) |
|||
CUDA_11_LIB_DIR=$(strip $(shell ls -d $(CUDA_11_PATH)/lib64 2>/dev/null || ls -d $(CUDA_11_PATH)/lib 2>/dev/null)) |
|||
CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_LIB_DIR)" -L"$(CUDA_11_LIB_DIR)/stubs" |
|||
endif |
|||
@ -0,0 +1,17 @@ |
|||
# Common definitions for the various Makefiles which set cuda settings |
|||
# No rules are defined here so this is safe to include at the beginning of other makefiles |
|||
|
|||
ifeq ($(OS),windows) |
|||
CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown |
|||
CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null)) |
|||
CUDA_12_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null) |
|||
CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc.exe) |
|||
CUDA_12_LIB_DIR = $(strip $(shell ls -d $(CUDA_12_PATH)/bin 2>/dev/null)) |
|||
CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_PATH)/lib/x64" |
|||
else ifeq ($(OS),linux) |
|||
CUDA_PATH?=/usr/local/cuda |
|||
CUDA_12_PATH:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null) |
|||
CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc) |
|||
CUDA_12_LIB_DIR=$(strip $(shell ls -d $(CUDA_12_PATH)/lib64 2>/dev/null || ls -d $(CUDA_12_PATH)/lib 2>/dev/null)) |
|||
CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_LIB_DIR)" -L"$(CUDA_12_LIB_DIR)/stubs" |
|||
endif |
|||
@ -0,0 +1,54 @@ |
|||
# Common definitions for all cuda versions |
|||
|
|||
ifndef GPU_RUNNER_VARIANT |
|||
dummy: |
|||
$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables) |
|||
endif |
|||
|
|||
|
|||
GPU_RUNNER_NAME := cuda$(GPU_RUNNER_VARIANT) |
|||
GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT) |
|||
GPU_RUNNER_DRIVER_LIB_LINK := -lcuda |
|||
GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt |
|||
|
|||
ifeq ($(OS),windows) |
|||
# On windows, nvcc uses msvc which does not support avx512vbmi avx512vnni avx512bf16, but macros can turn them on |
|||
GPU_VECTOR_FLAGS=$(call uc,$(filter-out avx512bf16,$(filter-out avx512vnni,$(filter-out avx512vbmi,$(GPU_RUNNER_CPU_FLAGS))))) |
|||
GPU_COMPILER_EXTRA_FLAGS=$(if $(filter avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VBMI__) |
|||
GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512vnni,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VNNI__) |
|||
GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512bf16,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512BF16__) |
|||
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))) |
|||
GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602 |
|||
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602 |
|||
else ifeq ($(OS),linux) |
|||
# On linux, nvcc requires avx512 -> -mavx512f -mavx512dq -mavx512bw |
|||
GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS)) |
|||
GPU_COMPILER_EXTRA_FLAGS = -fPIC -Wno-unused-function -std=c++11 |
|||
GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))) |
|||
GPU_COMPILER_CFLAGS = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE |
|||
GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE |
|||
endif |
|||
GPU_DIST_LIB_DEPS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS)))) |
|||
|
|||
GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \ |
|||
-DGGML_CUDA_USE_GRAPHS=1 |
|||
GPU_COMPILER_CUFLAGS = \ |
|||
$(GPU_COMPILER_EXTRA_FLAGS) \ |
|||
-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(GPU_VECTOR_FLAGS))" \ |
|||
-t2 \ |
|||
-DGGML_CUDA_DMMV_X=32 \ |
|||
-DGGML_CUDA_MMV_Y=1 \ |
|||
-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \ |
|||
-DGGML_USE_CUDA=1 \ |
|||
-DGGML_SHARED=1 \ |
|||
-DGGML_BUILD=1 \ |
|||
-DGGML_USE_LLAMAFILE \ |
|||
-DK_QUANTS_PER_ITERATION=2 \ |
|||
-DNDEBUG \ |
|||
-D_GNU_SOURCE \ |
|||
-D_XOPEN_SOURCE=600 \ |
|||
-Wno-deprecated-gpu-targets \ |
|||
--forward-unknown-to-host-compiler \ |
|||
-use_fast_math \ |
|||
-I./llama/ \ |
|||
-O3 |
|||
@ -0,0 +1,90 @@ |
|||
# Generalized GPU runner build
#
# This file is a template: it is meant to be included by another Makefile that
# has already set GPU_RUNNER_NAME and the other GPU_* variables used below.

# Guard against running this file on its own. The error is raised from the
# recipe of a placeholder target, so it fires when that target is built.
ifndef GPU_RUNNER_NAME
dummy:
	$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
endif

# Flags passed to "go build" for the runner: strip the binary and stamp the
# version plus the comma-joined CPU feature list into the executable.
# EXTRA_GOLDFLAGS is the intended spelling for caller-supplied linker flags;
# the historical misspelling EXTRA_GOLDLAGS is still honoured so existing
# callers keep working.
GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS))\" $(EXTRA_GOLDFLAGS) $(EXTRA_GOLDLAGS) $(TARGET_LDFLAGS)"
|||
|
|||
# TODO Unify how we handle dependencies in the dist/packaging and install flow
# today, cuda is bundled, but rocm is split out. Should split them each out by runner
# Until then, GPU dependency libraries are placed in the common dist lib directory.
DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR)

# Versioned shared libraries (<prefix><name>.<ext>.*) found in $(GPU_LIB_DIR)
# for every short library name in $(GPU_RUNNER_LIBS_SHORT).
GPU_RUNNER_LIBS = $(wildcard $(foreach lib,$(GPU_RUNNER_LIBS_SHORT),$(GPU_LIB_DIR)/$(SHARED_PREFIX)$(lib).$(SHARED_EXT).*))
|||
|
|||
# CUDA sources found on disk, and the flash-attention subset of them.
gpu_cuda_all_srcs   := $(wildcard llama/ggml-cuda/*.cu)
gpu_cuda_fattn_srcs := $(wildcard llama/ggml-cuda/fattn*.cu)

# Sources always compiled into the GPU runner library: the CUDA backend entry
# point, every ggml-cuda kernel except the fattn* ones (added conditionally
# elsewhere), the mmq template instances, and the core ggml C/C++ files.
GPU_RUNNER_SRCS := \
    llama/ggml-cuda.cu \
    $(filter-out $(gpu_cuda_fattn_srcs),$(gpu_cuda_all_srcs)) \
    $(wildcard llama/ggml-cuda/template-instances/mmq*.cu) \
    llama/ggml.c \
    llama/ggml-backend.c \
    llama/ggml-alloc.c \
    llama/ggml-quants.c \
    llama/sgemm.cpp \
    llama/ggml-aarch64.c

# CUDA headers; listed as prerequisites of the runner library.
GPU_RUNNER_HDRS := $(wildcard llama/ggml-cuda/*.cuh)
|||
|
|||
|
|||
# Conditional flags and components to speed up developer builds.
# Default (OLLAMA_FAST_BUILD empty): compile the flash-attention kernels and
# the selected fattn template instances. When OLLAMA_FAST_BUILD is set to any
# non-empty value, skip them and define GGML_DISABLE_FLASH_ATTN instead.
ifeq ($(OLLAMA_FAST_BUILD),)
GPU_RUNNER_SRCS += \
    $(wildcard llama/ggml-cuda/fattn*.cu) \
    $(wildcard llama/ggml-cuda/template-instances/fattn-wmma*.cu) \
    $(wildcard llama/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
    $(wildcard llama/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
    $(wildcard llama/ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
else
GPU_COMPILER_CUFLAGS += -DGGML_DISABLE_FLASH_ATTN
endif
|||
|
|||
# Object files for the runner: every .cu, .c and .cpp source maps to
# $(BUILD_DIR)/<source stem>.$(GPU_RUNNER_NAME).$(OBJ_EXT). Embedding the runner
# name in the object name keeps objects of different runners apart.
gpu_runner_obj_suffix := .$(GPU_RUNNER_NAME).$(OBJ_EXT)
GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,\
    $(patsubst %.cpp,%$(gpu_runner_obj_suffix),\
    $(patsubst %.c,%$(gpu_runner_obj_suffix),\
    $(patsubst %.cu,%$(gpu_runner_obj_suffix),$(GPU_RUNNER_SRCS)))))
|||
|
|||
# Runner executable for this GPU variant, in the dist tree and in the build tree.
# The paths are spelled the same way as the rule targets further down.
DIST_RUNNERS = $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT)
BUILD_RUNNERS = $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT)

# "make <runner name>" builds the runner in the build tree.
$(GPU_RUNNER_NAME): $(BUILD_RUNNERS)

# "make dist" additionally stages it in the dist tree.
dist: $(DIST_RUNNERS)
|||
|
|||
# Build targets

# Header dependencies: without this, editing a header only relinked the shared
# library from stale objects. This prerequisite-only rule forces the affected
# objects to be recompiled; the recipes still come from the pattern rules
# below, where $< remains the matching source file.
$(GPU_RUNNER_OBJS): $(COMMON_HDRS) $(GPU_RUNNER_HDRS)

# CUDA sources: compiled with the CUDA-specific flags and target architectures.
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu
	@mkdir -p $(dir $@)
	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $<

# C sources: also built by the GPU compiler, using the C flag set only.
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c
	@mkdir -p $(dir $@)
	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $<

# C++ sources: same as above with the C++ flag set.
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
	@mkdir -p $(dir $@)
	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $<
|||
# Build-tree locations of this variant's runner executable and ggml library.
gpu_runner_build_dir := $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)
gpu_runner_build_exe := $(gpu_runner_build_dir)/ollama_llama_server$(EXE_EXT)
gpu_runner_build_lib := $(gpu_runner_build_dir)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)

# Runner executable: a Go binary built from ./cmd/runner with cgo pointed at the
# directory holding the freshly built ggml library. The build tags are the CPU
# flags plus the runner's Go tags, comma-joined.
$(gpu_runner_build_exe): TARGET_CGO_LDFLAGS = $(CGO_EXTRA_LDFLAGS) -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/"
$(gpu_runner_build_exe): $(gpu_runner_build_lib) ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
	@-mkdir -p $(@D)
	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./cmd/runner

# ggml shared library for this runner: links all runner objects against the
# GPU libraries named in $(GPU_RUNNER_LIBS_SHORT).
$(gpu_runner_build_lib): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
	@-mkdir -p $(@D)
	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(addprefix -l,$(GPU_RUNNER_LIBS_SHORT)) $(GPU_RUNNER_OBJS) -o $@
|||
|
|||
# Distribution targets

# Dist-tree locations of this variant's runner executable and ggml library,
# plus the build-tree library the dist copy is made from.
gpu_runner_dist_dir  := $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)
gpu_runner_lib_name  := $(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
gpu_runner_built_lib := $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(gpu_runner_lib_name)

# Generic staging rule: any file in the runners build tree can be copied to
# the same relative path in the runners dist tree.
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
	@-mkdir -p $(@D)
	$(CP) $< $@

# The staged runner also needs its ggml library and the GPU dependency
# libraries staged (prerequisites only; the recipe is the generic rule above).
$(gpu_runner_dist_dir)/ollama_llama_server$(EXE_EXT): $(gpu_runner_dist_dir)/$(gpu_runner_lib_name) $(GPU_DIST_LIB_DEPS)

# Stage the ggml shared library next to the runner.
$(gpu_runner_dist_dir)/$(gpu_runner_lib_name): $(gpu_runner_built_lib)
	@-mkdir -p $(@D)
	$(CP) $< $@

# Copy each GPU dependency library from $(GPU_LIB_DIR) into its dist directory.
# No prerequisite is declared, so an existing copy is never refreshed.
$(GPU_DIST_LIB_DEPS):
	@-mkdir -p $(@D)
	$(CP) $(GPU_LIB_DIR)/$(@F) $(@D)/
|||
|
|||
# Remove what this makefile builds for the current runner: the objects, the
# runner executables, and the ggml shared library in both the build and dist
# trees (the library used to be left behind).
clean:
	rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS)
	rm -f $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
	rm -f $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)

# None of these names is a file this makefile produces. "dist" is declared
# phony as well so that a file or directory called "dist" cannot be mistaken
# for the target.
.PHONY: clean dist $(GPU_RUNNER_NAME)
|||
|
|||
|
|||
# Handy debugging for make variables: "make print-FOO" echoes FOO=<expanded value of FOO>
print-%: ; @echo '$*=$($*)'
|||
|
|||
@ -0,0 +1,9 @@ |
|||
# Common definitions for the various Makefiles which set ROCm (HIP) settings
# No rules are defined here so this is safe to include at the beginning of other makefiles

# HIP_COMPILER is the hipcc binary under $(HIP_PATH), or empty when it is not
# present. On windows HIP_PATH is used as provided; on linux it defaults to
# /opt/rocm when that path exists and HIP_PATH was not already set.
ifeq ($(OS),windows)
HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc.bin.exe)
else ifeq ($(OS),linux)
# $(wildcard) yields /opt/rocm only if it exists, without forking a shell on
# every expansion of this recursively-expanded variable.
HIP_PATH?=$(wildcard /opt/rocm)
HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc)
endif
|||
@ -1,50 +0,0 @@ |
|||
package runners |
|||
|
|||
import ( |
|||
"log/slog" |
|||
"os" |
|||
"path" |
|||
"runtime" |
|||
"strings" |
|||
"testing" |
|||
"testing/fstest" |
|||
) |
|||
|
|||
func TestRefreshRunners(t *testing.T) { |
|||
slog.SetLogLoggerLevel(slog.LevelDebug) |
|||
|
|||
payloadFS := fstest.MapFS{ |
|||
path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")}, |
|||
} |
|||
tmpDir, err := os.MkdirTemp("", "testing") |
|||
if err != nil { |
|||
t.Fatalf("failed to make tmp dir %s", err) |
|||
} |
|||
t.Setenv("OLLAMA_TMPDIR", tmpDir) |
|||
rDir, err := Refresh(payloadFS) |
|||
if err != nil { |
|||
t.Fatalf("failed to extract to %s %s", tmpDir, err) |
|||
} |
|||
if !strings.Contains(rDir, tmpDir) { |
|||
t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir) |
|||
} |
|||
|
|||
// spot check results
|
|||
servers := GetAvailableServers(rDir) |
|||
if len(servers) < 1 { |
|||
t.Fatalf("expected at least 1 server") |
|||
} |
|||
|
|||
// Refresh contents
|
|||
rDir, err = extractRunners(payloadFS) |
|||
if err != nil { |
|||
t.Fatalf("failed to extract to %s %s", tmpDir, err) |
|||
} |
|||
if !strings.Contains(rDir, tmpDir) { |
|||
t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir) |
|||
} |
|||
|
|||
cleanupTmpDirs() |
|||
|
|||
Cleanup(payloadFS) |
|||
} |
|||
Loading…
Reference in new issue