only thinking about the money and the fame and attention. –Urban Dictionary
a computer that can solve many problems at a fast rate –Urban Dictionary
;; Baseline in plain Clojure: dot product of two 100000-element vectors,
;; computed by multiplying element-wise and summing the products.
(let [size 100000
      dot-product (fn [as bs]
                    (reduce + (map * as bs)))
      x-vec (vec (range size))
      y-vec (vec (range size))]
  (dot-product x-vec y-vec))
333328333350000
Execution time: 14 ms
(require '(uncomplicate.neanderthal [core :refer :all] [native :refer :all]))
;; The same dot product, using Neanderthal's native vectors:
;; x holds 0..99999 and is multiplied with a copy of itself.
(let [x (fv (range 100000))]
  (dot x (copy x)))
3.33328352E14
(require '[uncomplicate.clojurecuda.core :refer :all] '[uncomplicate.neanderthal.cuda :refer :all] '[uncomplicate.commons.core :refer :all])
;; The same dot product, computed through Neanderthal's CUDA engine.
;; with-release releases both vectors once the body has been evaluated.
(with-default
  (with-default-engine
    (with-release [gpu-xs (cuv (range 100000))
                   gpu-ys (copy gpu-xs)]
      (dot gpu-xs gpu-ys))))
3.33328352E14
;; Multiplies a 1000x100000 matrix (every entry 0.01) by a copy of its
;; 100000x1000 transpose, writing the 1000x1000 product into gpu-c.
;; All three matrices are created through the CUDA factory (cuge) and are
;; released by with-release when the body finishes.
(with-default
  (with-default-engine
    (with-release [gpu-x (entry! (cuge 1000 100000) 0.01)
                   gpu-y (copy (trans gpu-x))
                   ;; created with cuge, so this is a GPU matrix, not a CPU one
                   gpu-c (cuge 1000 1000)]
      ;; gpu-c = 1 * gpu-x * gpu-y + 0 * gpu-c
      (mm! 1 gpu-x gpu-y 0 gpu-c)
      ;; block until the queued GPU work has completed
      (synchronize!)
      true)))
true
;; Initialize the CUDA driver before any other driver call is made.
(init)
true
;; Ask the driver how many CUDA devices are available.
(device-count)
2
;; Handle to the CUDA device with ordinal 0.
(def my-nvidia-gpu (device 0))
#'user/my-nvidia-gpu
;; Query the device's properties (name, memory size, limits, ...) as a map.
(info my-nvidia-gpu)
{:max-grid-dim-y 65535, :total-mem 11721506816, :name "GeForce GTX 1080 Ti", :max-threads-per-multiprocessor 2048, :max-shared-memory-per-block 49152, :compute-capability-major 6, :global-memory-bus-width 352, :memory-clock-rate 5505000, :max-threads-per-block 1024, :multiprocessor-count 28, :warp-size 32, :max-registers-per-block 65536 ;;... much more data }
;; Create a CUDA context on the selected device.
(def ctx (context my-nvidia-gpu))
#'user/ctx
;; Inspect the context's settings (limits, cache config, owning device, ...).
(info ctx)
{:dev-runtime-pending-launch-count 2048 :dev-runtime-sync-depth 2 :malloc-heap-size 8388608 :stack-size 1024 :api-version 3020 :stream-priority-range (0 -1) :cache-config :prefer-none :printf-fifo-size 1048576 :device #object[jcuda.driver.CUdevice 0x12be4426 "CUdevice[nativePointer=0x0]"] :shared-config :four-byte-bank-size}
;; Check that the context created above is the one currently in use.
(= ctx (current-context))
true
;; Allocate a GPU buffer large enough for 256 single-precision floats
;; (256 * 4 bytes = 1024 bytes).
(def gpu-array
  (mem-alloc (* 256 Float/BYTES)))
#'user/gpu-array
;; Host-side float array holding the values 0, 1, ..., 255.
(def main-array
  (-> 256
      range
      float-array))
#'user/main-array
;; Peek at the first ten values of the host array.
(take 10 main-array)
(0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0)
;; Copy the host array into the GPU buffer (source first, destination second);
;; the call returns the destination buffer.
(memcpy-host! main-array gpu-array)
#object[uncomplicate.clojurecuda.internal.impl.CULinearMemory 0x38701ca4 "uncomplicate.clojurecuda.internal.impl.CULinearMemory@38701ca4"]
;; Copy the GPU buffer back into a fresh 256-element host array
;; and show its first 12 values.
(->> (float-array 256)
     (memcpy-host! gpu-array)
     (take 12))
(0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0)
// Adds 1.0f to each of the first n elements of a; each thread handles
// at most one element.
extern "C" __global__ void increment(int n, float *a) {
    // index of this thread within the whole 1-D grid
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // threads whose index falls outside the array have nothing to do
    if (idx >= n) {
        return;
    }

    a[idx] += 1.0f;
}
;; CUDA C source of the `increment` kernel, kept as a string so that it can
;; be compiled at runtime.
(def kernel-source
  "extern \"C\" __global__ void increment (int n, float *a) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { a[i] = a[i] + 1.0f; } };")

;; Wrap the source in a program object and compile it.
(def hello-program
  (-> kernel-source
      program
      compile!))
#'user/kernel-source#'user/hello-program
;; Load the compiled program as a CUDA module.
(def hello-module
  (module hello-program))

;; Look the kernel up in the module by its name.
(def increment
  (function hello-module "increment"))
#'user/hello-module#'user/increment
;; Launch the kernel over a 1-D grid sized for 256 elements, passing 256 and
;; the GPU buffer as the kernel's (int n, float *a) arguments.
(launch! increment (grid-1d 256) (parameters 256 gpu-array))
nil
;; Read the GPU buffer back into a fresh host array once more, to see the
;; first 12 values after the kernel launch.
(->> (float-array 256)
     (memcpy-host! gpu-array)
     (take 12))
(1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 12.0)
…instead of writing everything as a huge, impenetrable C++ codebase and then just calling it with dummy wrappers
The presentation can be accessed on my blog:
Find more at: