mirror of https://gitee.com/namelin2022/ollama
12 changed files with 43 additions and 58 deletions
@ -1,32 +0,0 @@ |
|||||
From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001 |
|
||||
From: Bruce MacDonald <brucewmacdonald@gmail.com> |
|
||||
Date: Tue, 5 Sep 2023 16:05:08 -0400 |
|
||||
Subject: [PATCH] metal: add missing barriers for mul-mat #2699 |
|
||||
|
|
||||
---
|
|
||||
ggml-metal.metal | 2 ++ |
|
||||
1 file changed, 2 insertions(+) |
|
||||
|
|
||||
diff --git a/ggml-metal.metal b/ggml-metal.metal
|
|
||||
index 3f31252..ce3541f 100644
|
|
||||
--- a/ggml-metal.metal
|
|
||||
+++ b/ggml-metal.metal
|
|
||||
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
|
||||
//load data and store to threadgroup memory |
|
||||
half4x4 temp_a; |
|
||||
dequantize_func(x, il, temp_a); |
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
||||
#pragma unroll(16) |
|
||||
for (int i = 0; i < 16; i++) { |
|
||||
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ |
|
||||
@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
|
||||
} |
|
||||
} else { |
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix |
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
||||
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ |
|
||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; |
|
||||
for (int i = 0; i < 8; i++) { |
|
||||
--
|
|
||||
2.39.2 (Apple Git-143) |
|
||||
|
|
||||
@ -0,0 +1,27 @@ |
|||||
|
From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001 |
||||
|
From: Michael Yang <mxyng@pm.me> |
||||
|
Date: Wed, 20 Sep 2023 14:19:52 -0700 |
||||
|
Subject: [PATCH] copy cuda runtime libraries |
||||
|
|
||||
|
---
|
||||
|
CMakeLists.txt | 4 ++++ |
||||
|
1 file changed, 4 insertions(+) |
||||
|
|
||||
|
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
|
index 824d9f2..dd24137 100644
|
||||
|
--- a/CMakeLists.txt
|
||||
|
+++ b/CMakeLists.txt
|
||||
|
@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
|
||||
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) |
||||
|
endif() |
||||
|
|
||||
|
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
|
||||
|
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
|
||||
|
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
|
||||
|
+
|
||||
|
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) |
||||
|
# 52 == lowest CUDA 12 standard |
||||
|
# 60 == f16 CUDA intrinsics |
||||
|
--
|
||||
|
2.42.0 |
||||
|
|
||||
Loading…
Reference in new issue