一、K8s集群部署¶
1.1 Runtime安装¶
1、卸载已经安装的 docker
apt-get remove docker \
docker-client \
docker-client-latest \
docker-common \
docker-latest \
docker-latest-logrotate \
docker-logrotate \
docker-engine docker-ce containerd -y
2、配置源
# 更新源
sudo apt-get update
# 下载工具
sudo apt-get install -y ca-certificates curl gnupg lsb-release
# 创建目录
sudo mkdir -p /etc/apt/keyrings
# 配置源
curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://mirrors.aliyun.com/docker-ce/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
# 更新源
sudo apt-get update
3、安装 Containerd
apt-get install containerd.io -y
4、配置 Containerd 的内核
cat <<EOF | sudo tee /etc/modules-load.d/containerd.conf
overlay
br_netfilter
EOF
sudo modprobe overlay
sudo modprobe br_netfilter
cat <<EOF | sudo tee /etc/sysctl.d/99-kubernetes-cri.conf
net.bridge.bridge-nf-call-iptables = 1
net.ipv4.ip_forward = 1
net.bridge.bridge-nf-call-ip6tables = 1
EOF
# 内核参数生效
sudo sysctl --system
5、创建 Containerd 的配置文件
# 生成配置文件
sudo mkdir -p /etc/containerd
containerd config default | sudo tee /etc/containerd/config.toml
# 修改配置文件
sudo sed -i 's#SystemdCgroup = false#SystemdCgroup = true#g' /etc/containerd/config.toml
sudo sed -i 's#k8s.gcr.io/pause#registry.cn-hangzhou.aliyuncs.com/google_containers/pause#g' /etc/containerd/config.toml
sudo sed -i 's#registry.gcr.io/pause#registry.cn-hangzhou.aliyuncs.com/google_containers/pause#g' /etc/containerd/config.toml
sudo sed -i 's#registry.k8s.io/pause#registry.cn-hangzhou.aliyuncs.com/google_containers/pause#g' /etc/containerd/config.toml
6、启动Containerd
systemctl daemon-reload
systemctl restart containerd
# 验证结果
root@VM-0-10-ubuntu:/home/ubuntu# ctr plugins ls
1.2 Kubernetes 部署¶
1、关闭swap
# 临时关闭
swapoff -a
# 永久关闭:在 /etc/fstab 配置文件中注释掉 swap 分区的自动挂载行
vim /etc/fstab
2、添加 Kubernetes 源
国内:
apt-get update && apt-get install -y apt-transport-https
# 下载并处理 Kubernetes GPG 密钥
curl -fsSL \
https://mirrors.aliyun.com/kubernetes-new/core/stable/v1.31/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
# 添加 Kubernetes 软件源到系统源列表
echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://mirrors.aliyun.com/kubernetes-new/core/stable/v1.31/deb/ /" | sudo tee /etc/apt/sources.list.d/kubernetes.list
国外:
# 安装必要的依赖包
sudo apt install -y apt-transport-https ca-certificates curl gpg
# 下载并处理 Kubernetes 的 GPG 密钥
curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.31/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
# 添加 Kubernetes 软件源到系统源列表,并抑制输出
echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.31/deb/ /' | sudo tee /etc/apt/sources.list.d/kubernetes.list > /dev/null
3、安装 Kubernetes 组件
apt-get update
apt-get install -y kubelet kubeadm kubectl
sudo apt-mark hold kubelet kubeadm kubectl
4、集群初始化
国内环境:
# 预拉取 Kubernetes 安装所需的镜像
sudo kubeadm config images pull \
--image-repository registry.cn-hangzhou.aliyuncs.com/google_containers --kubernetes-version 1.31.6
# 在 master 节点进行 init,其他节点不要进行 init
sudo kubeadm init --apiserver-advertise-address 192.168.0.104 \
--image-repository registry.cn-hangzhou.aliyuncs.com/google_containers \
--cri-socket "unix:///var/run/containerd/containerd.sock" \
--kubernetes-version 1.31.6
国外环境:
# 预拉取指定版本的 Kubernetes 安装所需镜像
sudo kubeadm config images pull --kubernetes-version 1.31.6
# 在 master 节点进行初始化操作,其他节点请勿执行此步骤
sudo kubeadm init --apiserver-advertise-address 10.224.0.2 --cri-socket "unix:///var/run/containerd/containerd.sock" --kubernetes-version 1.31.6
5、配置 kubeconfig
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
6、安装 Addons
# 安装git工具
apt-get install git -y
# 下载源码文件
git clone https://gitee.com/dukuan/k8s-ha-install.git
# 开始安装
cd k8s-ha-install/
git checkout manual-installation-v1.31.x
cd single/
kubectl create -f .
7、解除污点
这里使用 Master 节点进行测试,所以解除 Master 节点的污点,使其可以像 Node 节点一样被调度运行 Pod。
kubectl taint node node-role.kubernetes.io/control-plane- --all
8、查看集群状态
# 查看node使用情况
kubectl top node
# 查看node节点
kubectl get node
二、K8s GPU Operator部署¶
2.1 Helm 安装¶
官方安装文档:https://helm.sh/docs/intro/install/
Helm 安装包:https://github.com/helm/helm/releases
1、安装Helm
# 下载安装包
mkdir helm && cd helm
wget https://get.helm.sh/helm-v3.16.2-linux-amd64.tar.gz
# 解压安装
tar xf helm-v3.16.2-linux-amd64.tar.gz
mv linux-amd64/helm /usr/local/bin/
# 验证版本
helm version
2、创建Namespace
kubectl create ns gpu-operator
2.2 部署 GPU Operator¶
1、添加仓库
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
&& helm repo update
2、下载安装包
# 下载指定版本的gpu-operator
helm pull nvidia/gpu-operator --version v24.9.2
# 解压
tar xf gpu-operator-v24.9.2.tgz
# 进入安装目录
cd gpu-operator/
3、开始安装
helm install gpu-operator -n gpu-operator --create-namespace .
说明:国内机器需要修改 charts/node-feature-discovery/values.yaml 中的镜像仓库地址(可用 vim 编辑)
4、查看 Pod 状态
kubectl get po -n gpu-operator
5、查看 GPU 资源
kubectl describe node | grep Allocatable: -A 10
Allocatable:
cpu: 8
ephemeral-storage: 95001823485
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 31973804Ki
pods: 110
System Info:
Machine ID: aa0de61855c940efb7546ad537e45332
System UUID: aa0de618-55c9-40ef-b754-6ad537e45332
Boot ID: d3b4740f-86b1-49ec-8f74-5ef6500f1cbf
6、创建 GPU 测试服务
# 编写yaml文件
vim test.yaml
apiVersion: v1
kind: Pod
metadata:
name: cuda-vectoradd
spec:
restartPolicy: OnFailure
containers:
- name: cuda-vectoradd
image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04"
resources:
limits:
nvidia.com/gpu: 1
# 应用
kubectl create -f test.yaml
7、查看日志
kubectl logs cuda-vectoradd
#回显
[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
Copy output data from the CUDA device to the host memory
Test PASSED
Done
三、动态存储配置¶
1、Local Path Storage 部署
#创建存储目录
mkdir -p /data/local-path-provisioner
#定制Local Path Storage yaml文件
vim localpath.yaml
apiVersion: v1
kind: Namespace
metadata:
name: local-path-storage
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: local-path-provisioner-service-account
namespace: local-path-storage
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: local-path-provisioner-role
namespace: local-path-storage
rules:
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch", "create", "patch", "update", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: local-path-provisioner-role
rules:
- apiGroups: [""]
resources: ["nodes", "persistentvolumeclaims", "configmaps", "pods", "pods/log"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["persistentvolumes"]
verbs: ["get", "list", "watch", "create", "patch", "update", "delete"]
- apiGroups: [""]
resources: ["events"]
verbs: ["create", "patch"]
- apiGroups: ["storage.k8s.io"]
resources: ["storageclasses"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: local-path-provisioner-bind
namespace: local-path-storage
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: local-path-provisioner-role
subjects:
- kind: ServiceAccount
name: local-path-provisioner-service-account
namespace: local-path-storage
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: local-path-provisioner-bind
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: local-path-provisioner-role
subjects:
- kind: ServiceAccount
name: local-path-provisioner-service-account
namespace: local-path-storage
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: local-path-provisioner
namespace: local-path-storage
spec:
replicas: 1
selector:
matchLabels:
app: local-path-provisioner
template:
metadata:
labels:
app: local-path-provisioner
spec:
serviceAccountName: local-path-provisioner-service-account
containers:
- name: local-path-provisioner
image: registry.cn-beijing.aliyuncs.com/dotbalo/local-path-provisioner:v0.0.31
imagePullPolicy: IfNotPresent
command:
- local-path-provisioner
- --debug
- start
- --config
- /etc/config/config.json
volumeMounts:
- name: config-volume
mountPath: /etc/config/
env:
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: CONFIG_MOUNT_PATH
value: /etc/config/
volumes:
- name: config-volume
configMap:
name: local-path-config
---
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: local-path
provisioner: rancher.io/local-path
volumeBindingMode: WaitForFirstConsumer
reclaimPolicy: Delete
---
kind: ConfigMap
apiVersion: v1
metadata:
name: local-path-config
namespace: local-path-storage
data:
config.json: |-
{
"nodePathMap":[
{
"node":"DEFAULT_PATH_FOR_NON_LISTED_NODES",
"paths":["/data/local-path-provisioner"]
}
]
}
setup: |-
#!/bin/sh
set -eu
mkdir -m 0777 -p "$VOL_DIR"
teardown: |-
#!/bin/sh
set -eu
rm -rf "$VOL_DIR"
helperPod.yaml: |-
apiVersion: v1
kind: Pod
metadata:
name: helper-pod
spec:
priorityClassName: system-node-critical
tolerations:
- key: node.kubernetes.io/disk-pressure
operator: Exists
effect: NoSchedule
containers:
- name: helper-pod
image: registry.cn-beijing.aliyuncs.com/dotbalo/busybox
imagePullPolicy: IfNotPresent
#应用yaml
kubectl create -f localpath.yaml
2、结果验证
# 查看pod状态
kubectl get po -n local-path-storage
# 查看StorageClass
kubectl get sc local-path
3、创建 PVC 测试
vim pvc-test.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: local-path-pvc
spec:
accessModes:
- ReadWriteOnce
storageClassName: local-path
resources:
requests:
storage: 128Mi
# 应用
kubectl create -f pvc-test.yaml
4、创建 Pod
# 编写yaml文件
vim pod.yaml
apiVersion: v1
kind: Pod
metadata:
name: volume-test
spec:
containers:
- name: volume-test
image: registry.cn-beijing.aliyuncs.com/dotbalo/redis:v7.0.15
imagePullPolicy: IfNotPresent
volumeMounts:
- name: volv
mountPath: /data
ports:
- containerPort: 80
volumes:
- name: volv
persistentVolumeClaim:
claimName: local-path-pvc
# 应用yaml文件
kubectl create -f pod.yaml
5、验证结果
# 查看pod
kubectl get po volume-test
NAME READY STATUS RESTARTS AGE
volume-test 1/1 Running 0 46s
# 查看pvc
kubectl get pvc local-path-pvc
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS VOLUMEATTRIBUTESCLASS AGE
local-path-pvc Bound pvc-36d0c89e-eab7-4960-adff-9494224f50ef 128Mi RWO local-path <unset> 64s
四、K8s Ollama Operator部署¶
官网:https://ollama-operator.ayaka.io/
1、部署Ollama Operator
kubectl apply \
--server-side=true \
-f https://raw.githubusercontent.com/nekomeowww/ollama-operator/v0.10.1/dist/install.yaml
2、查看部署状态
kubectl get po -n ollama-operator-system
3、下载客户端工具并上传到服务器的/home/ubuntu目录
客户端工具下载:https://github.com/nekomeowww/ollama-operator/releases/
cd /home/ubuntu
wget https://github.com/nekomeowww/ollama-operator/releases/download/v0.10.5/kollama_v0.10.5_linux_amd64.tar.gz
4、解压客户端工具
tar xf kollama_v0.10.5_linux_amd64.tar.gz
5、测试客户端工具是否正常使用
./kollama --help