k8s
k8s installation
-
Set system parameters
# Disable swap
swapoff -a
sed -i 's/.*swap/#&/' /etc/fstab
# Kernel parameters
cat <<EOF > /etc/sysctl.d/k8s.conf
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
net.ipv4.ip_nonlocal_bind = 1
net.ipv4.ip_forward = 1
vm.swappiness = 0
vm.max_map_count = 262144
net.netfilter.nf_conntrack_max = 1000000
EOF
modprobe br_netfilter
sysctl -p /etc/sysctl.d/k8s.conf
echo "* soft nofile 65536" >> /etc/security/limits.conf
echo "* hard nofile 65536" >> /etc/security/limits.conf -
Install Docker
-
Install via script
curl -fsSL https://get.docker.com/ | sh -s -- --mirror Aliyun
-
Install via yum
yum -y install yum-utils
yum-config-manager --add-repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo
yum -y install docker-ce
# Configure the default Docker registry mirror
cat <<EOF > /etc/docker/daemon.json
{
"registry-mirrors": ["https://fl791z1h.mirror.aliyuncs.com"]
}
EOF
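After writing daemon.json, restart Docker so the registry mirror takes effect (a minimal sketch):
systemctl daemon-reload
systemctl enable docker
systemctl restart docker
docker info | grep -A1 "Registry Mirrors"   # confirm the mirror is listed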
-
Add the Aliyun Kubernetes yum repo
cat <<EOF > /etc/yum.repos.d/kubernetes.repo
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64/
enabled=1
gpgcheck=1
repo_gpgcheck=1
gpgkey=https://mirrors.aliyun.com/kubernetes/yum/doc/yum-key.gpg https://mirrors.aliyun.com/kubernetes/yum/doc/rpm-package-key.gpg
EOF
-
Install k8s
-
Install the tools
yum install -y kubectl kubelet kubeadm
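Enable kubelet so it starts on boot; it will crash-loop until kubeadm init/join has run, which is expected:
systemctl enable --now kubelet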
-
Initialize the cluster
--service-cidr: Service IP range
--pod-network-cidr: Pod IP range
kubeadm init --kubernetes-version=1.18.0 --apiserver-advertise-address=172.16.7.14 --image-repository registry.aliyuncs.com/google_containers --service-cidr=10.10.0.0/16 --pod-network-cidr=10.244.0.0/16
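After kubeadm init finishes, set up kubectl for the current user (the standard post-init step that kubeadm itself prints):
mkdir -p $HOME/.kube
cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
chown $(id -u):$(id -g) $HOME/.kube/config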
-
-
Install a network plugin
-
Install the flannel network plugin
kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
-
Install the calico network plugin
Download the calico.yaml that uses etcd as the datastore
curl https://docs.projectcalico.org/manifests/calico-etcd.yaml -o calico.yaml
Modify the data section of the kind: Secret
etcd-key: null --> etcd-key: # replace with the output of: cat /etc/kubernetes/pki/etcd/server.key | base64 -w 0
etcd-cert: null --> etcd-cert: # replace with the output of: cat /etc/kubernetes/pki/etcd/server.crt | base64 -w 0
etcd-ca: null --> etcd-ca: # replace with the output of: cat /etc/kubernetes/pki/etcd/ca.crt | base64 -w 0
Modify the data section of the kind: ConfigMap
etcd_endpoints: "http://<ETCD_IP>:<ETCD_PORT>" --> etcd_endpoints: "https://172.16.7.14:2379"
etcd_ca: "" # "/calico-secrets/etcd-ca" --> etcd_ca: "/calico-secrets/etcd-ca"
etcd_cert: "" # "/calico-secrets/etcd-cert" --> etcd_cert: "/calico-secrets/etcd-cert"
etcd_key: "" # "/calico-secrets/etcd-key" --> etcd_key: "/calico-secrets/etcd-key"
Configure IP autodetection for the Calico nodes so the correct IP address is used for routing; set interface= to the actual physical NIC.
Without this setting, the calico-node pod on the master stays NotReady and errors such as "Connect Socket: Connection reset by peer bird: BGP: Unexpected connect from unknown address" appear.
kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=eth.*
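To confirm the change rolled out, check that the calico-node pods become Ready (a sketch; the k8s-app=calico-node label comes from the upstream manifest):
kubectl -n kube-system get pods -l k8s-app=calico-node -o wide
kubectl -n kube-system logs -l k8s-app=calico-node -c calico-node --tail=20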
-
-
Install the dashboard
Install
kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.0/aio/deploy/recommended.yaml
Edit the Service: change type from ClusterIP to NodePort and set the nodePort
kubectl edit svc kubernetes-dashboard -n kubernetes-dashboard
Grant permissions
kubectl create clusterrolebinding serviceaccount-cluster-admin --clusterrole=cluster-admin --user=system:serviceaccount:kubernetes-dashboard:kubernetes-dashboard
Get the token for logging in to the web UI
kubectl describe secrets -n kubernetes-dashboard $(kubectl -n kubernetes-dashboard get secret|grep kubernetes-dashboard-token|awk '{print $1}')| grep token | awk 'NR==3{print $2}'
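To find the port to browse to, look up the assigned NodePort (sketch), then open https://<node-ip>:<nodePort> and paste the token:
kubectl -n kubernetes-dashboard get svc kubernetes-dashboard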
Enable IPVS mode for the cluster:
# Edit config.conf in the kube-system/kube-proxy ConfigMap and set mode: "ipvs"
kubectl edit cm kube-proxy -n kube-system
# Restart the kube-proxy pods on every node
kubectl get pod -n kube-system | grep kube-proxy | awk '{system("kubectl delete pod "$1" -n kube-system")}'
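IPVS mode needs the IPVS kernel modules loaded on every node; a minimal check/load sketch (module names vary slightly by kernel, e.g. nf_conntrack vs nf_conntrack_ipv4, and ipvsadm is a separate package):
for m in ip_vs ip_vs_rr ip_vs_wrr ip_vs_sh nf_conntrack; do modprobe $m; done
lsmod | grep ip_vs
ipvsadm -Ln   # should list the kube-proxy virtual servers once IPVS mode is active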
Enable a VIP (with multiple masters, run this on each of them):
# Define the VIP; it must be an unused IP on the same host network
export VIP=172.16.7.18
# Define the NIC; the host's current network interface
export INTERFACE=eth0
ctr image pull ghcr.io/kube-vip/kube-vip:v0.4.0
ctr run --rm --net-host ghcr.io/kube-vip/kube-vip:v0.4.0 vip /kube-vip manifest pod \
--interface $INTERFACE \
--vip $VIP \
--controlplane \
--services \
--arp \
--leaderElection | tee /etc/kubernetes/manifests/kube-vip.yaml
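Once the static pod comes up, the VIP should appear on the chosen interface; a quick check (sketch):
crictl ps | grep kube-vip   # or: docker ps | grep kube-vip
ip addr show $INTERFACE | grep $VIP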
Install NFS as a StorageClass:
git clone https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner
# Edit deploy/deployment.yaml so the server and path match the actual NFS server address and export directory
cd nfs-subdir-external-provisioner && kubectl apply -f deploy/deployment.yaml -f deploy/rbac.yaml -f deploy/class.yaml
# Make the NFS StorageClass the default
kubectl patch storageclass managed-nfs-storage -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
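A quick way to verify the default StorageClass works is to create a small PVC with no storageClassName and check that it becomes Bound (an illustrative sketch; the name test-claim is arbitrary):
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: test-claim
spec:
  accessModes: ["ReadWriteMany"]
  resources:
    requests:
      storage: 1Mi
EOF
kubectl get pvc test-claim   # STATUS should become Bound
kubectl delete pvc test-claim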
Change a node's ROLES:
kubectl label --overwrite nodes nodename kubernetes.io/role=node1
Bump the revision annotation to trigger a rolling update:
revision=`kubectl get deploy test01-app1 -ojson|jq -r '.metadata.annotations."deployment.kubernetes.io/revision"|tonumber+1'`
kubectl patch deployment test01-app1 -p "{\"spec\":{\"template\":{\"metadata\":{\"annotations\":{\"deployment.kubernetes.io/revision\":\"$revision\"}}}}}"
Rolling restart (kubectl 1.15+):
kubectl rollout restart deploy myapp-deploy -n ops
If ingress-nginx passes the matched path through to the backend, add the following to the Ingress configuration to strip it:
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /
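For context, a minimal Ingress using this annotation might look like the sketch below (the name myapp, host, and backend service are placeholders; the apiVersion matches k8s 1.18):
cat <<EOF | kubectl apply -f -
apiVersion: networking.k8s.io/v1beta1
kind: Ingress
metadata:
  name: myapp
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /
spec:
  rules:
  - host: myapp.example.com
    http:
      paths:
      - path: /myapp
        backend:
          serviceName: myapp-svc
          servicePort: 80
EOF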
Join a new node to the cluster:
kubeadm token create --print-join-command
Export the current cluster configuration:
kubeadm config view > k8s.yaml
Renew the certificates:
kubeadm alpha certs renew all --config=k8s.yaml
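To confirm the renewal, kubeadm can print the new expiration dates (sketch; on newer releases the subcommand is kubeadm certs check-expiration):
kubeadm alpha certs check-expiration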
Delete a resource stuck in Terminating that cannot be removed:
# Remove the '- finalizers.kubesphere.io/namespaces' entry
kubectl edit namespace myns
# Or patch it directly
kubectl patch ns/myns -p '{"metadata":{"finalizers":[]}}' --type=merge
kubectl patch crd/helmcategories.application.kubesphere.io -p '{"metadata":{"finalizers":[]}}' --type=merge
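If a namespace still will not go away because spec.finalizers is set, it can be finalized through the API; a sketch that assumes jq is installed:
kubectl get ns myns -o json | jq '.spec.finalizers=[]' | kubectl replace --raw "/api/v1/namespaces/myns/finalize" -f -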
Regenerate all files under /etc/kubernetes:
kubeadm init phase certs all --config k8s.yml
kubeadm init phase kubeconfig all --config k8s.yml
kubeadm init phase control-plane all --config k8s.yml
kubeadm init phase etcd local --config k8s.yml
# Update the cluster-info configuration
kubeadm init phase bootstrap-token
# Restart the control-plane components
docker ps |grep -E 'k8s_kube-apiserver|k8s_kube-controller-manager|k8s_kube-scheduler|k8s_etcd_etcd' | awk '{print $1}'|xargs docker restart
Repair the kubelet configuration:
systemctl stop kubelet
rm -rf /var/lib/kubelet/pki/ /etc/kubernetes/kubelet.conf
kubeadm init phase kubeconfig kubelet --config k8s.yml
kubeadm init phase kubelet-start --config k8s.yml
etcd fails to start (after a node goes down):
vim /etc/kubernetes/manifests/etcd.yaml
Add the flag below to override the old cluster membership; remove it once etcd starts normally
--force-new-cluster
View etcd cluster information
etcdctl --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key endpoint status --cluster -w table
KubeSphere installation:
# Install the Helm client
wget https://get.helm.sh/helm-v2.14.3-linux-amd64.tar.gz
tar -zxvf helm-v2.14.3-linux-amd64.tar.gz
mv linux-amd64/helm /usr/local/bin/
# Install the Tiller server
helm init --upgrade --tiller-image registry.cn-hangzhou.aliyuncs.com/google_containers/tiller:v2.14.3 --stable-repo-url https://kubernetes.oss-cn-hangzhou.aliyuncs.com/charts
kubectl create serviceaccount --namespace kube-system tiller
kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
kubectl patch deploy --namespace kube-system tiller-deploy -p '{"spec":{"template":{"spec":{"serviceAccount":"tiller"}}}}'
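A quick check that Tiller came up and Helm can reach it (sketch):
kubectl -n kube-system rollout status deploy/tiller-deploy
helm version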
etcd
Configure an etcdctl alias:
alias etcdctl='etcdctl --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key'
View cluster information:
etcdctl endpoint status --cluster -w table
List all keys:
etcdctl get / --prefix --keys-only
Create a snapshot backup:
etcdctl snapshot save snapshot.db
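To verify the snapshot is usable before relying on it (sketch):
etcdctl snapshot status snapshot.db -w table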
Restore from a backup:
-
Single master
etcdctl snapshot restore /root/snapshot.db --data-dir=/var/lib/etcd
-
Multiple masters
Copy the snapshot.db backup file to each master
Run the restore on every master: --name is that host's name and --initial-advertise-peer-urls is that host's etcd peer address
#master1
etcdctl snapshot restore /root/snapshot.db --name master1 --initial-cluster master1=https://192.168.200.201:2380,master2=https://192.168.200.202:2380,master3=https://192.168.200.203:2380 --initial-advertise-peer-urls https://192.168.200.201:2380 --data-dir /var/lib/etcd
#master2
etcdctl snapshot restore /root/snapshot.db --name master2 --initial-cluster master1=https://192.168.200.201:2380,master2=https://192.168.200.202:2380,master3=https://192.168.200.203:2380 --initial-advertise-peer-urls https://192.168.200.202:2380 --data-dir /var/lib/etcd
#master3
etcdctl snapshot restore snapshot.db --name master3 --initial-cluster master1=https://192.168.200.201:2380,master2=https://192.168.200.202:2380,master3=https://192.168.200.203:2380 --initial-advertise-peer-urls https://192.168.200.203:2380 --data-dir /var/lib/etcd
k8s cluster repair
On the master nodes, the /etc/kubernetes directory contains:
- A set of certificates and the CA for etcd (under /etc/kubernetes/pki/etcd)
- A set of Kubernetes certificates and the CA (under /etc/kubernetes/pki)
- The kubeconfig files used by kube-controller-manager, kube-scheduler, cluster-admin, and kubelet
- The static Pod manifests for etcd, kube-apiserver, kube-scheduler, and kube-controller-manager (under /etc/kubernetes/manifests)
-
Export the current cluster configuration
If the cluster is no longer reachable and the configuration cannot be exported, then when running the kubeadm commands below to generate configs and certificates, manually pass --apiserver-advertise-address, --service-cidr, etc. matching the current cluster, or write k8s.yml by hand.
kubeadm config view > k8s.yml
apiServer:
  extraArgs:
    authorization-mode: Node,RBAC
  timeoutForControlPlane: 4m0s
apiVersion: kubeadm.k8s.io/v1beta2
certificatesDir: /etc/kubernetes/pki
clusterName: kubernetes
controllerManager: {}
dns:
  type: CoreDNS
etcd:
  local:
    dataDir: /var/lib/etcd
imageRepository: registry.aliyuncs.com/google_containers
kind: ClusterConfiguration
kubernetesVersion: v1.18.12
networking:
  dnsDomain: cluster.local
  podSubnet: 10.244.0.0/16
  serviceSubnet: 10.10.0.0/16
scheduler: {}
-
Delete all old configuration files on the master, including certificates
rm -rf /etc/kubernetes
-
Repair the control-plane components
Run everything below on one of the master nodes
Generate all of the Kubernetes SSL certificates, plus the static Pod manifests and kubeconfig files for the Kubernetes services.
kubeadm init phase certs all --config k8s.yml
kubeadm init phase kubeconfig all --config k8s.yml
kubeadm init phase control-plane all --config k8s.yml
kubeadm init phase etcd local --config k8s.yml
cp -f /etc/kubernetes/admin.conf ~/.kube/config
Restart all of the old control-plane components.
docker ps |grep -E 'k8s_kube-apiserver|k8s_kube-controller-manager|k8s_kube-scheduler|k8s_etcd_etcd' | awk '{print $1}'|xargs docker restart
If you use kubeadm to join kubelets, you also need to update the cluster-info configuration in the kube-public namespace, because it still contains the hash of your old CA.
kubeadm init phase bootstrap-token
Since all certificates on the other master nodes must also be signed by the same single CA, manually copy the files below to the other masters and repeat the commands above on each of them.
scp /etc/kubernetes/pki/{ca,front-proxy-ca}.{key,crt} master2
scp /etc/kubernetes/pki/sa.{key,pub} master2
As an alternative to copying the certificates by hand, you can also use the Kubernetes API, with commands like the following:
kubeadm init phase upload-certs --upload-certs
kubeadm token create --print-join-command
These commands encrypt and upload the certificates to Kubernetes for 2 hours; you can then register master nodes on the other masters as follows:
# Refresh the certificates on an existing master node
kubeadm join phase control-plane-prepare all kubernetes-apiserver:6443 --control-plane --token cs0etm.ua7fbmwuf1jz946l --discovery-token-ca-cert-hash sha256:555f6ececd4721fed0269d27a5c7f1c6d7ef4614157a18e56ed9a1fd031a3ab8 --certificate-key 385655ee0ab98d2441ba8038b4e8d03184df1806733eac131511891d1096be73
# Join a new master node
kubeadm join kubernetes-apiserver:6443 --control-plane --token cs0etm.ua7fbmwuf1jz946l --discovery-token-ca-cert-hash sha256:555f6ececd4721fed0269d27a5c7f1c6d7ef4614157a18e56ed9a1fd031a3ab8 --certificate-key 385655ee0ab98d2441ba8038b4e8d03184df1806733eac131511891d1096be73
Note that the Kubernetes API also has a configuration that holds the CA certificate for front-proxy clients, used to verify requests from the apiserver to webhooks and aggregation-layer services; kube-apiserver updates it automatically, though.
-
Repair the worker nodes
Now we can list all of the cluster's nodes with the following command (master):
kubectl get node
Of course all nodes are now NotReady, because they are still using the old certificates. To fix this, we will use kubeadm to re-join the nodes to the cluster (master).
systemctl stop kubelet
rm -rf /var/lib/kubelet/pki/ /etc/kubernetes/kubelet.conf
kubeadm init phase kubeconfig kubelet --config k8s.yml
kubeadm init phase kubelet-start --config k8s.yml
But to join the worker nodes, we must generate a new token (master).
kubeadm token create --print-join-command
Then run the following commands on each worker node (node):
systemctl stop kubelet
rm -rf /var/lib/kubelet/pki/ /etc/kubernetes/pki/ /etc/kubernetes/kubelet.conf
kubeadm join phase kubelet-start kubernetes-apiserver:6443 --token cs0etm.ua7fbmwuf1jz946l --discovery-token-ca-cert-hash sha256:555f6ececd4721fed0269d27a5c7f1c6d7ef4614157a18e56ed9a1fd031a3ab8
You do not need to delete the /etc/kubernetes/pki directory on the master nodes, since it already contains all of the required certificates.
The steps above re-join all of your kubelets to the cluster without affecting any containers already running on them. However, if the cluster has multiple nodes and you do not do this on all of them at the same time, you may hit a situation where kube-controller-manager starts recreating containers from the NotReady nodes and tries to reschedule them onto the live nodes.
To prevent this, we can temporarily stop controller-manager on the master nodes (optional).
rm /etc/kubernetes/manifests/kube-controller-manager.yaml
crictl rmp $(crictl pods --name kube-controller-manager -q)
Once all of the cluster's nodes have been joined, you can generate a static manifest for controller-manager again by running the following command on all master nodes (optional).
kubeadm init phase control-plane controller-manager --config k8s.yml
If the kubelets are configured to request certificates signed by your CA (the serverTLSBootstrap: true option), you also need to approve the CSRs from the kubelets (optional):
kubectl get csr
kubectl certificate approve <csr>
-
Update the etcd certificates in calico (when calico uses the etcd datastore)
# Source: calico/templates/calico-etcd-secrets.yaml
# The following contains k8s Secrets for use with a TLS enabled etcd cluster.
# For information on populating Secrets, see http://kubernetes.io/docs/user-guide/secrets/
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: calico-etcd-secrets
  namespace: kube-system
data:
  # Populate the following with etcd TLS configuration if desired, but leave blank if
  # not using TLS for etcd.
  # The keys below should be uncommented and the values populated with the base64
  # encoded contents of each file that would be associated with the TLS data.
  # Example command for encoding a file contents: cat <file> | base64 -w 0
  etcd-key: null --> etcd-key: # replace with the output of: cat /etc/kubernetes/pki/etcd/server.key | base64 -w 0
  etcd-cert: null --> etcd-cert: # replace with the output of: cat /etc/kubernetes/pki/etcd/server.crt | base64 -w 0
  etcd-ca: null --> etcd-ca: # replace with the output of: cat /etc/kubernetes/pki/etcd/ca.crt | base64 -w 0
# Either edit the Secret in place: kubectl edit secret calico-etcd-secrets -n kube-system
# or replace it from the edited manifest:
kubectl replace -f calico-etcd-secrets.yml
-
Repair the ServiceAccounts
Because we lost /etc/kubernetes/pki/sa.key, the key used to sign the JWT tokens of every ServiceAccount in the cluster, we must recreate the tokens for each ServiceAccount.
This can be done by deleting the token field from every Secret of type kubernetes.io/service-account-token.
kubectl get secret --all-namespaces | awk '/kubernetes.io\/service-account-token/ { print "kubectl patch secret -n " $1 " " $2 " -p {\\\"data\\\":{\\\"token\\\":null}}"}' | sh -x
After they are deleted, kube-controller-manager automatically generates new tokens signed with the new key. Note, however, that not every microservice picks up the new token on the fly, so you will most likely need to restart the containers that use the tokens by hand (check the actual situation before restarting the corresponding pods).
kubectl get pod --field-selector 'spec.serviceAccountName!=default' --no-headers -n kube-system | awk '{print "kubectl delete pod -n " $1 " " $2 " --wait=false --grace-period=0"}'
For example, the command above produces a list of commands that delete every Pod using a non-default serviceAccount. I recommend running it for the kube-system namespace first, since kube-proxy and the CNI plugin are installed there and they are critical to the communication between your microservices.
At this point our cluster is fully recovered.
Links:
-
https://z.itpub.net/article/detail/75AC8916765D4CD9980915384D705E9F
-
https://itnext.io/breaking-down-and-fixing-kubernetes-4df2f22f87c3
k8s custom hosts
coredns
kubectl edit cm -n kube-system coredns
# Add the hosts block (fallthrough must be present)
apiVersion: v1
data:
  Corefile: |
    .:53 {
        errors
        health {
           lameduck 5s
        }
        ready
        kubernetes cluster.local in-addr.arpa ip6.arpa {
           pods insecure
           fallthrough in-addr.arpa ip6.arpa
           ttl 30
        }
        hosts {
           10.100.1.71 node1 hadoop01
           10.100.1.72 node2 hadoop02
           10.100.1.73 node3 hadoop03
           fallthrough
        }
        prometheus :9153
        forward . /etc/resolv.conf
        cache 30
        loop
        reload
        loadbalance
    }
kind: ConfigMap
metadata:
  creationTimestamp: "2020-12-03T09:28:23Z"
  managedFields:
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:data: {}
    manager: kubeadm
    operation: Update
    time: "2020-12-03T09:28:23Z"
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:data:
        f:Corefile: {}
    manager: kubectl
    operation: Update
    time: "2021-10-12T02:40:57Z"
  name: coredns
  namespace: kube-system
  resourceVersion: "111183417"
  selfLink: /api/v1/namespaces/kube-system/configmaps/coredns
  uid: 337c1a28-cfe5-454d-8cbc-b6e15e64470d
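After saving, CoreDNS reloads the Corefile on its own (the reload plugin is enabled above); a quick resolution test from inside the cluster (sketch; the busybox image and pod name are arbitrary):
kubectl run dns-test --rm -it --image=busybox:1.28 --restart=Never -- nslookup hadoop01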
hostAliases
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-deployment
  labels:
    app: nginx
spec:
  replicas: 3
  selector:
    matchLabels:
      app: nginx
  template:
    metadata:
      labels:
        app: nginx
    spec:
      hostAliases:
      - ip: 127.0.0.1
        hostnames:
        - foo.local
        - bar.local
      - ip: 10.1.2.3
        hostnames:
        - foo.remote
        - bar.remote
      containers:
      - name: nginx
        image: nginx:1.14.2
        ports:
        - containerPort: 80
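hostAliases entries are rendered into each pod's /etc/hosts; a quick way to confirm (sketch):
kubectl exec -it $(kubectl get pod -l app=nginx -o jsonpath='{.items[0].metadata.name}') -- cat /etc/hosts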
k8s timezone settings
Set via an environment variable:
apiVersion: v1
kind: Pod
metadata:
  name: pod-env-tz
spec:
  containers:
  - name: ngx
    image: nginx:latest
    imagePullPolicy: IfNotPresent
    env:
    - name: TZ
      value: Asia/Shanghai
Set by mounting the host's timezone file:
apiVersion: v1
kind: Pod
metadata:
  name: pod-vol-tz
spec:
  containers:
  - name: ngx
    image: nginx:latest
    imagePullPolicy: IfNotPresent
    volumeMounts:
    - name: tz-config
      mountPath: /etc/localtime
      readOnly: true
  volumes:
  - name: tz-config
    hostPath:
      path: /etc/localtime
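Either way, the container clock should now report CST; a quick check (sketch):
kubectl exec pod-env-tz -- date
kubectl exec pod-vol-tz -- date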
Reference: https://developer.aliyun.com/article/637809#slide-0
k8s: clean up error (Evicted) pods
kubectl get pods -A | awk '/Evicted/{print $1,$2}' | xargs -r -n2 kubectl delete pod -n